t_start = Sys.time()
# CRAN packages:
library(tidyverse)
library(sf)
library(lubridate)
library(tidycensus)
library(ggExtra)
library(ggridges)
library(ggsn)
library(ragg) 
library(rstan)
library(drc)
library(spdep)
library(broom)
library(MASS)
library(spatialreg)
library(here)
library(pdftools)
library(matrixStats)
library(egg)
library(ggpubr)
library(scales)
library(qs)
library(corrplot)
library(readxl)
library(splines)
library(magic)
library(httr)
library(jsonlite)
library(DHARMa)
library(kableExtra)
library(lwgeom)
# Github packages available via: 
#   remotes::install_github("justlab/Just_universal")  
#   remotes::install_github("justlab/MTA_turnstile")
library(Just.universal) 

Session Configuration

#### SESSION CONFIGURATION ####
# Silence dplyr's "`summarise()` has grouped output" messages.
options(dplyr.summarise.inform=FALSE)

# data will default to a subfolder "data/" within working directory
# unless 1. set by an environment variable:
data.root = Sys.getenv("COVID_DATA")
# or 2. set with an alternative path here:
if (data.root == "") data.root = here("data")
if (!dir.exists(data.root)) dir.create(data.root)
print(paste("data being downloaded into directory", dQuote(data.root)))
## [1] "data being downloaded into directory \"/data/scratch/COVID_runtime/data\""
# Get some data from the git repository rather than downloading from original
#   source, to avoid changes in model results due to updated data
use_repo_data = TRUE

# MTA.turnstile (loaded below) reads its data directory from this
# environment variable, so default it to a subfolder of data.root.
if(Sys.getenv("MTA_TURNSTILE_DATA_DIR") == ""){ 
  # set up default download location for MTA turnstile data
  mta_dir = file.path(data.root, "mta_turnstile")
  if(!dir.exists(mta_dir)) dir.create(mta_dir, recursive = TRUE)
  Sys.setenv(MTA_TURNSTILE_DATA_DIR = mta_dir)
}
library(MTA.turnstile)
## MTA.turnstile data directory: /data/scratch/COVID_runtime/data/mta_turnstile
t_turnstile_1 = Sys.time()

# output path for figures
fig.path = here("figures")
if(!dir.exists(fig.path)) dir.create(fig.path)

export.figs = TRUE 
if(export.figs) message("Saving figures to:\n ", fig.path) else message("Not saving figures")
## Saving figures to:
##  /data/scratch/COVID_runtime/figures
# To generate census data, you need an API key, which you can request here: https://api.census.gov/data/key_signup.html
#census_api_key("INSERT YOUR CENSUS API KEY HERE", install = TRUE) 
if(Sys.getenv("CENSUS_API_KEY")=="") warning("Census API Key Missing")

# pairmemo function cache
pairmemo.dir = file.path(data.root, "pairmemo")
dir.create(pairmemo.dir, showWarnings = F)
# Wrapper around pairmemo() that fixes the cache directory and the calling
# frame, so memoized functions can be declared as pm(f <- function(...) ...).
pm <- function(...) {
  pairmemo(directory = pairmemo.dir, n.frame = 2, ...)
}

About pairmemo:

pairmemo caches the results of functions on disk based on their input parameters.
If you run this script and later modify the code, we recommend clearing the pairmemo cache for any function that runs after the point of your change before re-running the script.
The following commented lines would clear the pairmemo cache for each function. If you do not change the input Census or Pluto data, you would likely only need to run the last one.

#pairmemo.clear(get.Pluto)
#pairmemo.clear(get.qn.blocks)
#pairmemo.clear(acs.main)
#pairmemo.clear(get_essential_acs)
#pairmemo.clear(get.tract.res)
#pairmemo.clear(fit_BWQS_model)

Functions

#### Functions ####
# Read a CSV file and record its source path in a `filename` column, so rows
# from several files remain distinguishable after row-binding.
read_w_filenames <- function(flnm) {
  flnm %>%
    read_csv() %>%
    mutate(filename = flnm)
}

# Compute WAIC and importance-sampling LOO summaries from a fitted Stan model.
# (Appears adapted from the pre-`loo`-package waic() helper circulated with
# Gelman et al.'s examples -- TODO confirm provenance.)
# stanfit: an rstan `stanfit` object whose model exposes a "log_lik" quantity.
# Returns a list of totals: waic, elpd_waic, p_waic, elpd_loo, p_loo.
extract_waic <- function (stanfit){
  log_lik <- rstan::extract(stanfit, "log_lik")$log_lik
  # Coerce log_lik to an S x n matrix (S posterior draws, n observations),
  # collapsing any trailing dimensions.
  dim(log_lik) <- if (length(dim(log_lik)) == 1) 
    c(length(log_lik), 1)
  else c(dim(log_lik)[1], prod(dim(log_lik)[2:length(dim(log_lik))]))
  S <- nrow(log_lik)
  n <- ncol(log_lik)
  # Pointwise log predictive density and the WAIC effective-parameter penalty
  # (variance of log-likelihood across draws; colVars() is from matrixStats).
  lpd <- log(colMeans(exp(log_lik)))
  p_waic <- colVars(log_lik)
  elpd_waic <- lpd - p_waic
  waic <- -2 * elpd_waic
  # Truncated importance-sampling LOO: raw weights are normalized per column
  # and capped at sqrt(S) to regularize extreme weights.
  loo_weights_raw <- 1/exp(log_lik - max(log_lik))
  loo_weights_normalized <- loo_weights_raw/matrix(colMeans(loo_weights_raw), 
                                                   nrow = S, ncol = n, byrow = TRUE)
  loo_weights_regularized <- pmin(loo_weights_normalized, sqrt(S))
  elpd_loo <- log(colMeans(exp(log_lik) * loo_weights_regularized)/colMeans(loo_weights_regularized))
  p_loo <- lpd - elpd_loo
  # Sum the pointwise contributions; `se` is computed but not returned.
  pointwise <- cbind(waic, lpd, p_waic, elpd_waic, p_loo, elpd_loo)
  total <- colSums(pointwise)
  se <- sqrt(n * colVars(pointwise))
  return(list(waic = total["waic"], elpd_waic = total["elpd_waic"], 
              p_waic = total["p_waic"], elpd_loo = total["elpd_loo"], 
              p_loo = total["p_loo"]))
}

# Download a file, update metadata records, and load it with function `f`
# File metadata is stored in a sqlite file, by default in data/downloads/meta.sqlite
# Fetch `url` into data.root/downloads/<to>, recording file metadata via
# download.update.meta(), then load the local file with reader `f`.
# Additional arguments in `...` are forwarded to `f`.
download <- function(url, to, f, ...){
    local_path <- download.update.meta(url, file.path(data.root, "downloads"), to)
    f(local_path, ...)
}

Load Data

#### Load Data ####

Progress bars appear as a single updating line when run interactively, but knitr prints every refresh as a separate line.

# Subway ridership data
Subway_ridership_by_UHF <- relative.subway.usage(2020L, "nhood")
## Reading turnstile files
## Processing
## Parsing timestamps
## Sorting
## Setting station names
## Dropping 3 unlocated stations
## Dropping 278,230 observations
## Decumulating - entries
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
[manually edited for length]
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
## Dropped 189 sources of 5,292
## Dropped 1,102,556 observations of 60,962,516
## Decumulating - exits
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
[manually edited for length]
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
## Dropped 173 sources of 5,292
## Dropped 868,952 observations of 60,962,516
t_turnstile_2 = Sys.time()

# get the Pluto dataset from #https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page 
# Only the columns used downstream are kept (land use, BBL, floors, unit
# counts, ZIP code).  Cached to disk via pm(fst = T).  col_types pins three
# sparse columns as character -- presumably ones read_csv would otherwise
# guess wrong from the first rows (TODO confirm).
pm(fst = T,
get.Pluto <- function() download(
    "https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_pluto_20v3_csv.zip",
    "pluto.zip",
    function(p)
        read_csv(unz(p, "pluto_20v3.csv"), col_types = cols(spdist2 = col_character(), 
                                                overlay2 = col_character(),
                                                zonedist4 = col_character()))[,
            c("landuse", "bbl", "numfloors", "unitstotal", "unitsres",
                "zipcode")]))
Pluto <- as.data.frame(get.Pluto())
# If you get an error in running this function, the download may be incomplete/corrupt
# In that case, delete the pluto.zip file and try again:
#unlink(file.path(data.root, "downloads", "pluto.zip"))

# Cache the (large) building-footprints layer locally as a .qs file so the
# shapefile download and parse only happen once.
if (file.exists(file.path(data.root, "Bldg_Footprints.qs"))) {
    Bldg_Footprints <- qread(file.path(data.root, "Bldg_Footprints.qs"))
} else {
  Bldg_Footprints <- download(
    # https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh
    "https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=Shapefile",
    "building_footprints.zip",
    function(p)
      # read straight from the zip archive via GDAL's /vsizip/ virtual filesystem
      st_read(paste0("/vsizip/", p)))
  qsave(Bldg_Footprints, file.path(data.root, "Bldg_Footprints.qs"))
}
## Multiple layers are present in data source /vsizip//data/scratch/COVID_runtime/data/downloads/building_footprints.zip, reading layer `geo_export_aad3a557-b481-4681-b754-34463d92a741'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE, noBreaks. =
## FALSE, : automatically selected the first layer in a data source containing more
## than one.
## Reading layer `geo_export_aad3a557-b481-4681-b754-34463d92a741' from data source `/vsizip//data/scratch/COVID_runtime/data/downloads/building_footprints.zip' using driver `ESRI Shapefile'
## Simple feature collection with 1084927 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -74.2555 ymin: 40.49843 xmax: -73.70006 ymax: 40.91505
## CRS:            4326
# Borough membership for each ZIP code, scraped from the NY State cancer
# registry's neighborhoods appendix page.
ZCTA_by_boro <- download(
    "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm",
    "uhf_neighborhoods.html",
    function(p)
       {# XML::readHTMLTable doesn't identify the columns correctly.
        # Each regex match pairs a borough cell (headers="header1") with the
        # markup that follows it, up to the next borough cell or the end of
        # the table; 5-digit ZIPs are then extracted from that trailing markup.
        x = str_match_all(read_file(p), regex(dotall = T, paste0(
            '<td headers="header1"[^>]+>\\s*(.+?)</td>',
            '(.+?)',
            '(?=<td headers="header1"|</table>)')))[[1]]
        do.call(rbind, lapply(1 : nrow(x), function(i)
            data.frame(boro = x[i, 2], zip = as.integer(
                str_extract_all(x[i, 3], "\\b\\d{5}\\b")[[1]]))))})

# Download the specific day of test results by ZCTA being used
ZCTA_test_download <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/6d7c4a94d6472a9ffc061166d099a4e5d89cd3e3/tests-by-zcta.csv",
  "2020-05-07_tests-by-zcta.csv",
  identity
)

# Download COVID-19 testing data 
ZCTA_test_series <- ZCTA_test_download %>% 
  map_df(~read_w_filenames(.)) %>%
  mutate(date = as.Date(str_extract(filename, "[:digit:]{4}-[:digit:]{2}-[:digit:]{2}"))) %>%
  dplyr::select(-filename)
## Parsed with column specification:
## cols(
##   MODZCTA = col_double(),
##   Positive = col_double(),
##   Total = col_double(),
##   zcta_cum.perc_pos = col_double()
## )
# UHF definitions by zip codes, scraped from page 2 of the Infoshare UHF PDF.
# Each regex match captures the UHF code, the neighborhood name, and a
# comma-separated run of ZIPs, expanded to one row per (code, name, zip).
# FIX: removed the redundant doubled assignment
# (`UHF_ZipCodes <- UHF_ZipCodes <- download(...)`) and replaced 1:nrow(x)
# with seq_len(nrow(x)), which is safe when the regex matches nothing.
UHF_ZipCodes <- download(
    "http://www.infoshare.org/misc/UHF.pdf",
    "uhf_zips.pdf",
    function(p)
       {x = str_match_all(pdf_text(p)[2],
            "(\\d+)\\s+(\\S.+?\\S)\\s*([0-9,]+)")[[1]]
        do.call(rbind, lapply(seq_len(nrow(x)), function(i)
            data.frame(code = x[i, 2], name = x[i, 3], zip = as.integer(
                str_extract_all(x[i, 4], "\\b\\d{5}\\b")[[1]]))))})

# UHF shapefile 
UHF_shp <- download(
    "https://www1.nyc.gov/assets/doh/downloads/zip/uhf42_dohmh_2009.zip",
    "nyc_uhf_nhoods_shapefile.zip",
    function(p) read_sf(paste0("/vsizip/", p, "/UHF_42_DOHMH_2009")))

# NYC boroughs from NYC Open Data
NYC_basemap_shp <- download(
  "https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=Shapefile",
  "Borough_Boundaries.zip",
  function(p){
    unzip(p, exdir = file.path(data.root, "downloads"))
    # open data platform generates a random UUID for every download
    # NOTE(review): this pattern matches ANY geo_export_*.shp in the downloads
    # folder; if more than one Open Data export has been unzipped there,
    # `ufile` is a vector and st_read() will misbehave -- confirm the folder
    # only ever contains this one export.
    ufile = list.files(file.path(data.root, "downloads"), pattern = "geo_export_.*\\.shp", full.names = TRUE)
    st_read(ufile, stringsAsFactors = FALSE, quiet = TRUE) %>% st_transform(., crs = 2263)
  } 
)

# DOHMH MODZCTA Shapefile
MODZCTA_NYC_shp <- download(
  "https://data.cityofnewyork.us/api/geospatial/pri4-ifjk?method=export&format=Shapefile",
  "Modified Zip Code Tabulation Areas (MODZCTA).zip", 
  function(p) read_sf(paste0("/vsizip/", p))
)

# Food outlets 
if(use_repo_data){
  food_retail <- read_csv(here("data", "retail_food_stores_2019-06-13.csv"))
} else {
  food_retail <- download(
      "https://data.ny.gov/api/views/9a8c-vfzj/rows.csv",
      "retail_food_stores.csv",
      read_csv)
}
## Parsed with column specification:
## cols(
##   County = col_character(),
##   `License Number` = col_character(),
##   `Operation Type` = col_character(),
##   `Establishment Type` = col_character(),
##   `Entity Name` = col_character(),
##   `DBA Name` = col_character(),
##   `Street Number` = col_character(),
##   `Street Name` = col_character(),
##   `Address Line 2` = col_logical(),
##   `Address Line 3` = col_logical(),
##   City = col_character(),
##   State = col_character(),
##   `Zip Code` = col_double(),
##   `Square Footage` = col_double(),
##   Location = col_character()
## )
# Download deaths by ZCTA as of May 23rd
deaths_by23May2020_by_zcta <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/8d88b2c06cf6b65676d58b28979731faa10c193c/data-by-modzcta.csv",
  "2020-05-23_data-by-modzcta.csv",
  read_csv
)
## Parsed with column specification:
## cols(
##   MODIFIED_ZCTA = col_double(),
##   NEIGHBORHOOD_NAME = col_character(),
##   BOROUGH_GROUP = col_character(),
##   COVID_CASE_COUNT = col_double(),
##   COVID_CASE_RATE = col_double(),
##   POP_DENOMINATOR = col_double(),
##   COVID_DEATH_COUNT = col_double(),
##   COVID_DEATH_RATE = col_double(),
##   PERCENT_POSITIVE = col_double()
## )
# download MODZCTA to ZCTA crosswalk, current version from repo
modzcta_to_zcta <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/Geography-resources/ZCTA-to-MODZCTA.csv",
  "ZCTA-to-MODZCTA.csv",
  read_csv
)
## Parsed with column specification:
## cols(
##   ZCTA = col_double(),
##   MODZCTA = col_double()
## )
# Character-typed versions of the crosswalk, for joining against tables that
# key ZCTA / MODZCTA as character codes.
modzcta_to_zcta1 <- modzcta_to_zcta %>% mutate(ZCTA = as.character(ZCTA))
modzcta_to_zcta2 <- modzcta_to_zcta1 %>% mutate(MODZCTA = as.character(MODZCTA))
# Reference vectors of all NYC MODZCTAs (from the test series) and all NYC
# ZCTAs (from the crosswalk), used for filtering Census pulls below.
MODZCTAs_in_NYC <- as.character(unique(ZCTA_test_series$MODZCTA))
ZCTAs_in_NYC <- as.character(unique(modzcta_to_zcta$ZCTA))

# download ZIP to Tract crosswalk from HUD
zip_to_tract <- download( 
  "https://www.huduser.gov/portal/datasets/usps/ZIP_TRACT_062020.xlsx",
  "ZIP_TRACT_062020.xlsx",
  function(p) suppressWarnings(read_excel(path = p, col_types = c("text", "text", "numeric", "skip", "skip", "skip")))
) 

# Block to ZCTA and County crosswalk for NY
# (LEHD LODES geography crosswalk; block, ZCTA, and county codes are
# converted to character for joins against other character-keyed tables.)
ny_xwalk <- download("https://lehd.ces.census.gov/data/lodes/LODES7/ny/ny_xwalk.csv.gz",
                     "ny_xwalk.csv.gz",
                     function(p) {
                       zzf = gzfile(p)
                       # read.csv() opens and closes the connection itself
                       read.csv(zzf) %>% 
                         dplyr::select(cty, tabblk2010, zcta, blklondd, blklatdd) %>% 
                         mutate(tabblk2010 = as.character(tabblk2010), 
                                zcta = as.character(zcta),
                                cntyfips = as.character(cty)) %>% 
                         dplyr::select(-cty)
                     }
)

# We have many sources of data, so these just help to combine the various data types
NYC_counties1 <- c("Bronx","Kings","Queens","New York","Richmond")
NYC_counties1_full <- c("Bronx County","Kings County","Queens County","New York County","Richmond County")
NYC_boro_county_match <- tibble(County = c("Bronx","Kings","Queens","New York","Richmond"), 
                                boro = c("Bronx","Brooklyn","Queens","Manhattan","Staten Island"), 
                                full_county = c("Bronx County","Kings County","Queens County","New York County","Richmond County"),
                                fips = c("36005", "36047", "36081", "36061", "36085"))

# stan model script
BWQS_stan_model <- here("code", "nb_bwqs_cov.stan") 

Census Data

#### Census Data ####

# function to pull 2010 block population for Queens & Nassau counties
# (used to apportion ZCTAs straddling the Queens/Nassau border).
# Memoized to disk via pm().  Returns GEOID + pop2010 for every block.
pm(get.qn.blocks <- function(){
  # One decennial pull per county, then stack them (Nassau first, as before).
  per_county <- lapply(c("Nassau", "Queens"), function(county_name)
    get_decennial(geography = "block", variables = "P001001",
                  state = "NY", county = county_name, geometry = FALSE))
  bind_rows(per_county) %>%
    dplyr::select(GEOID, value) %>%
    rename("pop2010" = "value")
})

# function to pull 2018 ACS data 
# admin_unit: "zcta" or "tract" (geography passed to tidycensus::get_acs)
# state_unit: NULL for ZCTAs (a national geography), "NY" for tracts
# sf_shapes:  TRUE to attach sf geometry to the result
# Returns a wide table of 2014-2018 5-year ACS estimates restricted to NYC,
# with margin-of-error ("M") columns dropped and the trailing "E" stripped
# from estimate column names.  Memoized to disk via pm().
pm(acs.main <- function(admin_unit = c("zcta", "tract"), state_unit = c(NULL, "NY"), sf_shapes = c(TRUE, FALSE)) {
     # Collapse the choices vector to one validated value (the original
     # compared a possibly length-2 default with `==`).
     admin_unit <- match.arg(admin_unit)
     ACS_Data <- get_acs(geography = admin_unit,
                         state = state_unit,
                         geometry = sf_shapes,
                         variables = c(medincome = "B19013_001",
                                       total_pop1 = "B01003_001",
                                       fpl_100 = "B06012_002", 
                                       fpl_100to150 = "B06012_003",
                                       median_rent = "B25031_001",
                                       total_hholds1 = "B22003_001",
                                       hholds_snap = "B22003_002",
                                       over16total_industry1 = "C24050_001",
                                       ag_industry = "C24050_002",
                                       construct_industry = "C24050_003",
                                       manufact_industry = "C24050_004",
                                       wholesaletrade_industry = "C24050_005",
                                       retail_industry = "C24050_006",
                                       transpo_and_utilities_industry = "C24050_007",
                                       information_industry = "C24050_008",
                                       finance_and_realestate_industry = "C24050_009",
                                       science_mngmt_admin_industry = "C24050_010",
                                       edu_health_socasst_industry = "C24050_011",
                                       arts_entertain_rec_accomodate_industry = "C24050_012",
                                       othsvcs_industry = "C24050_013",
                                       publicadmin_industry = "C24050_014",
                                       total_commute1 = "B08301_001",
                                       drove_commute = "B08301_002",
                                       pubtrans_bus_commute = "B08301_011",
                                       pubtrans_subway_commute = "B08301_013",
                                       # FIX: railroad is B08301_014; this was
                                       # "B08301_013", a copy-paste duplicate of
                                       # the subway code above.
                                       pubtrans_railroad_commute = "B08301_014",
                                       pubtrans_ferry_commute = "B08301_015",
                                       taxi_commute = "B08301_016",
                                       bicycle_commute = "B08301_018",
                                       walked_commute = "B08301_019",
                                       workhome_commute = "B08301_021",
                                       unemployed = "B23025_005",
                                       under19_noinsurance = "B27010_017",
                                       age19_34_noinsurance = "B27010_033",
                                       age35_64_noinsurance = "B27010_050",
                                       age65plus_noinsurance = "B27010_066",
                                       hisplat_raceethnic = "B03002_012",
                                       nonhispLat_white_raceethnic = "B03002_003",
                                       nonhispLat_black_raceethnic = "B03002_004",
                                       nonhispLat_amerindian_raceethnic = "B03002_005",
                                       nonhispLat_asian_raceethnic = "B03002_006",
                                       age65_plus  = "B08101_008"),
                         year = 2018,
                         output = "wide",
                         survey = "acs5")
     
     if(admin_unit=="zcta"){
       ACS_Data <- ACS_Data %>% #only pull out the estimates and cleaning variable names
         filter(GEOID %in% ZCTAs_in_NYC) %>%
         dplyr::select(-NAME)  %>%
         dplyr::select(GEOID, !ends_with("M")) %>%
         rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2)))
     }
     
     if(admin_unit=="tract"){
       ACS_Data <- ACS_Data %>% #only pull out the estimates and cleaning variable names
         filter(substr(GEOID,1,5) %in% NYC_boro_county_match$fips) %>% # Tracts in NYC counties
         dplyr::select(-NAME)  %>%
         dplyr::select(GEOID, !ends_with("M")) %>%
         rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2)))
     }
     
     return(ACS_Data)
   })

# Use 2010 block population to scale Nassau County ZCTA contribution to NYC MODZCTAs
# This is the method used according to data dictionary at NYC Open Data:
#   https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk
# zcta_acs:  wide ZCTA-level ACS table (GEOID + estimate columns)
# mz_to_z:   MODZCTA-to-ZCTA crosswalk with character codes (modzcta_to_zcta2)
# blk_to_z:  census-block-to-ZCTA crosswalk with county FIPS (ny_xwalk)
# Returns zcta_acs with an added in_NYC_prop column, and every count column
# (all except GEOID, in_NYC_prop, medincome, median_rent) multiplied by the
# share of the ZCTA's 2010 population that lies inside Queens.
nassau_zcta_weights <- function(zcta_acs, mz_to_z, blk_to_z){
  blk_pop = get.qn.blocks()
  
  modzcta_span = c("11429", "11411", "11004") # MODZCTA with ZCTAs from both Queens & Nassau
  border_zcta <- mz_to_z %>% filter(MODZCTA %in% modzcta_span) %>% pull(ZCTA)
  # Filter block-zcta crosswalk table to Queens-Nassau ZCTA of interest
  blk_to_z <- blk_to_z %>% filter(zcta %in% border_zcta)
  
  # Join population to block-zcta crosswalk
  xwalk_pop <- blk_to_z %>% left_join(blk_pop, by = c("tabblk2010" = "GEOID")) 
  
  # Summarise 2010 population by ZCTA to calculate proportions inside NYC
  zcta_pop_2010 <- xwalk_pop %>%
    group_by(zcta) %>%
    summarise(z_pop_2010 = sum(pop2010), .groups = "drop_last")
  queens_zcta_pop_2010 <- xwalk_pop %>%
    filter(cntyfips == "36081") %>% # 36081 = Queens County
    group_by(zcta) %>%
    summarise(queens_z_pop_2010 = sum(pop2010), .groups = "drop_last")
  zcta_pop_props <- queens_zcta_pop_2010 %>% 
    left_join(zcta_pop_2010, by = "zcta") %>% 
    mutate(in_NYC_prop = queens_z_pop_2010/z_pop_2010)
  
  # ZCTAs not on the border get weight 1 (treated as entirely inside NYC)
  zcta_weights <- zcta_acs %>% dplyr::select(GEOID) %>% 
    left_join(zcta_pop_props, by = c("GEOID" = "zcta")) %>%
    dplyr::select(GEOID, in_NYC_prop) %>% 
    mutate(in_NYC_prop = 
             case_when(is.na(in_NYC_prop) ~ 1,
                       TRUE               ~ in_NYC_prop))
  
  # Apply weights for all Census variables except median vars
  zcta_acs <- zcta_acs %>% left_join(zcta_weights, by = "GEOID") 
  varvec <- 1:ncol(zcta_acs)
  varvec <- varvec[-grep("GEOID|in_NYC_prop|medincome|median_rent", names(zcta_acs))]
  # The value of this final assignment is also the function's return value.
  zcta_acs <- zcta_acs %>% mutate_at(vars(all_of(varvec)), ~ . * in_NYC_prop)
}

# function to clean ACS data
# Derive analysis variables from the raw ACS pull:
#  - "zcta": aggregate ZCTAs up to MODZCTA (population-weighted means for the
#    two median variables, sums for counts), then compute percentage/derived
#    variables; returns one row per MODZCTA with a character `zcta` column.
#  - "tract": compute the same derived variables directly, no aggregation.
# df: output of acs.main() (for "zcta", after nassau_zcta_weights() scaling).
clean_acs_data_and_derive_vars <- function(df, admin_unit = c("zcta", "tract")){
  if(admin_unit=="zcta"){
    # Medians cannot be summed; approximate the MODZCTA value with a
    # population-weighted mean over its member ZCTAs.
    ACS_Data1a <- df %>%
      left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
      group_by(MODZCTA) %>%
      summarise_at(vars(medincome, median_rent), ~weighted.mean(., total_pop1, na.rm = T)) %>% 
      rename(zcta = "MODZCTA")
    
    # Count variables are summed across member ZCTAs, then converted to
    # percentages and composite indices.
    ACS_Data1b <- df %>%
      left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
      group_by(MODZCTA) %>%
      summarise_at(vars(total_pop1:fpl_100to150, total_hholds1:age65_plus), ~sum(.)) %>%
      mutate_at(vars(ends_with("_commute")), ~round((./total_commute1)*100, 2)) %>% #proportion of people relying on a given mode of transit
      mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>% #proportion of ppl reporting a given race/ethnicity 
      mutate(not_insured = round(((under19_noinsurance + age19_34_noinsurance + age35_64_noinsurance + age65plus_noinsurance) / total_pop1)*100, 2), #proportion uninsured
             #snap_hholds = round((hholds_snap/total_hholds1)*100, 2), #proportion relying on SNAP
             #fpl_150 = round(((fpl_100+fpl_100to150)/total_pop1)*100, 2), #proportion 150% or less of FPL
             unemployed = round((unemployed/over16total_industry1)*100, 2), #proportion unemployed
             not_quarantined_jobs = round(((ag_industry+(construct_industry*.25)+wholesaletrade_industry+ #an estimate of who is still leaving the house for work based on industry
                                              (edu_health_socasst_industry*.5)+transpo_and_utilities_industry)/over16total_industry1)*100, 2)) %>%
      dplyr::select(-ends_with("_noinsurance"), -fpl_100, -fpl_100to150, -ends_with("_industry"), -hholds_snap) %>%
      rename(zcta = "MODZCTA") 
    
    ACS_Data2 <- left_join(ACS_Data1a, ACS_Data1b, by = "zcta") %>%
      mutate(zcta = as.character(zcta))
    
  } else{
    
    # Tract branch: identical derivations, applied per tract.
    ACS_Data2 <- df %>%
      mutate_at(vars(ends_with("_commute")), ~round((./total_commute1)*100, 2)) %>% #proportion of people relying on a given mode of transit
      mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>% #proportion of ppl reporting a given race/ethnicity 
      mutate(not_insured = round(((under19_noinsurance + age19_34_noinsurance + age35_64_noinsurance + age65plus_noinsurance) / total_pop1)*100, 2), #proportion uninsured
             unemployed = round((unemployed/over16total_industry1)*100, 2), #proportion unemployed
             not_quarantined_jobs = round(((ag_industry+(construct_industry*.25)+wholesaletrade_industry+ #an estimate of who is still leaving the house for work based on industry
                                              (edu_health_socasst_industry*.5)+transpo_and_utilities_industry)/over16total_industry1)*100, 2)) %>%
      dplyr::select(-ends_with("_noinsurance"), -ends_with("_industry"),-fpl_100, -fpl_100to150,-hholds_snap) 
  }
  
  return(ACS_Data2)
}

# Functions to pull mode of transportation for our approximate of essential workers
# Downloads 2018 5-year ACS counts of workers commuting by car (alone) vs.
# public transit within five industries treated as essential here
# (agriculture, construction, wholesale trade, transportation,
# education/healthcare).  Memoized to disk via pm(fst = T).
pm(fst = T, 
   get_essential_acs <- function(admin_unit, state_unit) {
     get_acs(geography = admin_unit, #pull down the relevant categories 
             state = state_unit,
             variables = c(ag_car1_commute = "B08126_017",
                           ag_pubtrans_commute = "B08126_047",
                           construct_car1_commute ="B08126_018",
                           construct_pubtrans_commute = "B08126_048",
                           wholesale_car1_commute = "B08126_020",
                           wholesale_pubtrans_commute = "B08126_050",
                           transpo_car1_commute = "B08126_022",
                           transpo_pubtrans_commute = "B08126_052",
                           ed_hlthcare_car1_commute = "B08126_026",
                           ed_hlthcare_pubtrans_commute = "B08126_056"),
             year = 2018, 
             output = "wide",
             survey = "acs5")
})

# Aggregate essential-worker commute estimates to MODZCTA or keep per tract.
# admin_unit: "zcta" or "tract"
# zcta_pop:   for "zcta", a table with GEOID and in_NYC_prop used to scale
#             ZCTAs straddling the Queens/Nassau border (required)
# state_unit: NULL for ZCTAs; must be "NY" for tracts
# Returns one row per MODZCTA (character `zcta`) or per tract (GEOID) with
# essentialworker_drove and essentialworker_pubtrans counts.
# FIX: the digits argument 0 was previously passed outside round()
# (`~ round(. / 2), 0`); round()'s digits default is 0, so moving it inside
# does not change results.
acs.essential <- function(admin_unit, zcta_pop = NULL, state_unit = NULL) {
  if(!admin_unit %in% c("zcta", "tract")) stop("admin_unit must be either 'zcta' or 'tract'")
  if(admin_unit == "tract" & is.null(state_unit)) stop("state_unit must be set to download tracts")
  if(!is.null(state_unit)) if(state_unit != "NY") stop("state_unit must be either NULL or 'NY'")
  
  ACS_EssentialWrkr_Commute <- get_essential_acs(admin_unit = admin_unit, state_unit = state_unit)
     
   if(admin_unit == "zcta"){
     if(is.null(zcta_pop)) stop("zcta_pop must be set for scaling MODZCTA on Queens/Nassau boundary")
     
     ACS_Essential_worker_estimates <- ACS_EssentialWrkr_Commute %>% #clean data and aggregate 
       dplyr::select(-ends_with("M"), -NAME) %>%
       filter(GEOID %in% ZCTAs_in_NYC) %>%
       # scale ZCTAs by proportion of population in NYC
       right_join(zcta_pop %>% dplyr::select("GEOID", "in_NYC_prop"), by = "GEOID") %>% 
       mutate_at(vars(2:11), ~ . * in_NYC_prop) %>% 
       # summarize ZCTA to MODZCTA
       left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
       group_by(MODZCTA) %>%
       summarise_at(vars(2:11),
                    ~ sum(., na.rm = T)) %>% 
       rename(zcta = "MODZCTA") %>%
       # same sector down-weighting as not_quarantined_jobs: half of
       # education/healthcare, a quarter of construction
       mutate_at(vars(starts_with("ed_hlthcare")), ~ round(. / 2, 0)) %>%
       mutate_at(vars(starts_with("construct")), ~ round(. / 4, 0)) %>%
       mutate(
         essentialworker_drove = rowSums(dplyr::select(., contains("car1_commute"))),
         essentialworker_pubtrans = rowSums(dplyr::select(., contains("pubtrans")))) %>%
       dplyr::select(zcta, essentialworker_drove, essentialworker_pubtrans) %>%
       mutate(zcta = as.character(zcta))
   } 
   else { # tracts
       
     ACS_Essential_worker_estimates <- ACS_EssentialWrkr_Commute %>% #clean data and aggregate 
         dplyr::select(-ends_with("M"), -NAME) %>%
         filter(substr(GEOID,1,5) %in% NYC_boro_county_match$fips) %>% # Tracts in NYC counties
         mutate_at(vars(starts_with("ed_hlthcare")), ~round(./2, 0)) %>% #maintain same proportions as estimated non-quarantined jobs
         mutate_at(vars(starts_with("construct")), ~round(./4, 0)) %>%
         mutate(essentialworker_drove = rowSums(dplyr::select(., contains("car1_commute"))), 
                essentialworker_pubtrans = rowSums(dplyr::select(., contains("pubtrans")))) %>%
         dplyr::select(GEOID, essentialworker_drove, essentialworker_pubtrans)   
   }
     
   return(ACS_Essential_worker_estimates)
}


# ZCTA CENSUS DATA
options(tigris_use_cache = TRUE)
ACS_Data1 <- as.data.frame(acs.main("zcta", NULL, FALSE)) #download the zcta data
## Getting data from the 2014-2018 5-year ACS
ACS_Data_scaled <- nassau_zcta_weights(ACS_Data1, modzcta_to_zcta2, ny_xwalk)
## Getting data from the 2010 decennial Census
## Getting data from the 2010 decennial Census
ACS_Data2 <- clean_acs_data_and_derive_vars(ACS_Data_scaled, "zcta")
ACS_EssentialWrkr_Commute1 = as.data.frame(acs.essential("zcta", zcta_pop = ACS_Data_scaled, state_unit = NULL))
## Getting data from the 2014-2018 5-year ACS
print(paste("The 2018 5-year ACS population range in NYC MODZCTAs is:", paste(range(ACS_Data2$total_pop1), collapse = "-")))
## [1] "The 2018 5-year ACS population range in NYC MODZCTAs is: 3028-112425"
# TRACT CENSUS DATA  
acs_tracts <- acs.main("tract", "NY", TRUE)
## Getting data from the 2014-2018 5-year ACS
acs_tracts2 <- clean_acs_data_and_derive_vars(acs_tracts, "tract")
acs_tracts_commute1 = as.data.frame(acs.essential("tract", state_unit = "NY"))
## Getting data from the 2014-2018 5-year ACS
t_census = Sys.time()

Grocery stores per area

#### Grocery stores per area ####

# Regex of name fragments indicating a non-supermarket food retailer
# (delis, pharmacies, liquor stores, etc.) to exclude from grocery counts.
non_supermarket_strings <- c("DELI|TOBACCO|GAS|CANDY|7 ELEVEN|7-ELEVEN|LIQUOR|ALCOHOL|BAKERY|CHOCOLATE|DUANE READE|WALGREENS|CVS|RITE AID|RAVIOLI|WINERY|WINE|BEER|CAFE|COFFEE")

# Restrict the statewide retail-food list to NYC supermarkets: NYC counties,
# establishment-type codes J, A, and C present but not H, names not matching
# the exclusion regex, and at least 4,500 sq ft; then build a geocodable
# one-line address.
food_retail_filtered <- food_retail %>% 
  filter(County %in% NYC_boro_county_match$County) %>% 
  filter(str_detect(`Establishment Type`, "J") & str_detect(`Establishment Type`, "A") & str_detect(`Establishment Type`, "C") &
           !str_detect(`Establishment Type`, "H")) %>%
  filter(!str_detect(`Entity Name`, non_supermarket_strings) & !str_detect(`DBA Name`, non_supermarket_strings)) %>%
  filter(`Square Footage`>=4500) %>%
  # ZIP is embedded in the Location field
  mutate(zcta = as.character(str_extract(Location, "[:digit:]{5}"))) %>% 
  mutate(Address = case_when(
    # some locations geocode better when address includes city name
    `License Number` %in% c("638599", "712410", "706967", "710078") ~ 
      paste(paste(`Street Number`, `Street Name`), City, State, zcta, sep = ","),
    # but most geocode better without it, see limitation #4 at: http://gis.ny.gov/gisdata/inventories/details.cfm?DSID=1278
    TRUE ~ 
      paste(paste(`Street Number`, `Street Name`), State, zcta, sep = ",") )
  )

# Geocode grocers, using a cached version if available to make analysis reproducible
# The geocoding service may be updated in the future and give different results
cached_grocers = file.path(data.root, "grocers_geocode_2020-11-09.csv")
if(file.exists(cached_grocers) && use_repo_data){   # scalar condition: use &&
  gctable <- read.csv(cached_grocers)
  failed = which(gctable$score == 0)  # score 0 marks a failed lookup
  message("Loaded cached geocoded grocers: ", nrow(gctable)-length(failed), "/", nrow(gctable), " have coordinates.") # 997/1037
  if(nrow(gctable) != nrow(food_retail_filtered)) warning("Cached geocoded table has different row count than non-geocoded table")
} else {
  # locations are returned in crs=26918, UTM 18N NAD83
  api = "https://gisservices.its.ny.gov/arcgis/rest/services/Locators/Street_and_Address_Composite/GeocodeServer/findAddressCandidates?f=json&maxLocations=1&SingleLine="
  message("Geocoding ", nrow(food_retail_filtered), " grocers with NY State ITS geocoder...")
  t1 = Sys.time()
  res = lapply(food_retail_filtered$Address, function(addr) {
    GET(url = paste0(api, URLencode(addr))) })
  tdiff = Sys.time() - t1
  # extract results
  geocodes = lapply(res, function(page) fromJSON(rawToChar(page$content), flatten = TRUE)$candidates)
  failed = which(sapply(geocodes, class) != "data.frame")
  # Backfill failed lookups with an all-NA, score-0 placeholder row so that
  # bind_rows() keeps one row per input address.
  # FIX: guarded with seq_along(); the previous `lapply(1:length(failed), ...)`
  # evaluated 1:0 == c(1, 0) and errored when no lookup failed.
  if (length(failed) > 0) {
    geocodes[failed] <- lapply(seq_along(failed), function(void){
      data.frame(address = NA_character_, score = 0, location.x = NA_real_, location.y = NA_real_)})
  }
  gctable = bind_rows(geocodes)
  message("Geocoded ", nrow(food_retail_filtered)-length(failed), "/", nrow(food_retail_filtered), 
          " grocers in ", round(as.numeric(tdiff),1), " ", attributes(tdiff)$units)
  # file.path() for consistency with the rest of the script
  write.csv(gctable, file.path(data.root, paste0("grocers_geocode_", Sys.Date(), ".csv")))
}
## Loaded cached geocoded grocers: 997/1037 have coordinates.
# Count grocers by tract
gctable = filter(gctable, score > 0)  # drop failed geocodes
# Geocoder output is UTM 18N NAD83 (EPSG 26918, per the API note above);
# project to NY state plane (EPSG 2263) to match the tract and ZCTA layers.
grocerSF = st_as_sf(gctable, coords = c("location.x", "location.y"), crs = 26918) %>% st_transform(crs = 2263)
tractSF = acs_tracts2[, "GEOID"] %>% st_transform(., crs = 2263)
# Point-in-polygon overlay; count distinct geocoded addresses per tract.
tract_grocers = suppressWarnings(st_intersection(tractSF, grocerSF)) %>%
  st_set_geometry(., NULL) %>%
  group_by(GEOID) %>%
  summarise(grocers = n_distinct(`address`))
#nrow(tract_grocers) # 749
#sum(tract_grocers$grocers) # 993

# Count grocers by ZCTA 
zctaSF <- MODZCTA_NYC_shp %>% dplyr::select(modzcta, geometry) %>% st_transform(crs = 2263) 
zcta_grocers <- suppressWarnings(st_intersection(zctaSF, grocerSF)) %>%
  st_set_geometry(., NULL) %>%
  group_by(modzcta) %>%
  summarise(grocers = n_distinct(`address`))
# sum(zcta_grocers$grocers) # 993
# nrow(zcta_grocers) # 172
# range(zcta_grocers$grocers) # 1, 21

Subway station locations

#### Subway station locations ####

# Station points from the MTA turnstile feed, projected to the NY State Plane
# (ft) CRS used throughout. Control areas beginning "PTH" are New Jersey PATH
# stations and are excluded before building the geometry.
SubwayStation_shp <- turnstile()$stations %>%
  as_tibble() %>%
  filter(!str_detect(ca, "PTH")) %>% # removing New Jersey PATH stations
  st_as_sf(coords = c("lon", "lat"), crs = 4269) %>%
  st_transform(crs = 2263)

Residential area

#### Residential area ####

# Restrict PLUTO tax lots to residential land-use codes "01"-"04" (string
# comparison; codes appear zero-padded -- confirm against PLUTO data dictionary)
# and key by BBL as character for joining to building footprints.
Pluto_ResOnly <- Pluto %>%
  filter(landuse>="01" & landuse<="04") %>%
  mutate(base_bbl = as.character(bbl)) %>%
  dplyr::select(-bbl)

# Vector of residential BBLs used to filter building footprints below
ResBBLs <- as.character(Pluto_ResOnly$base_bbl)

# zcta-level residential area
# Building volume = footprint area * roof height; if height is missing, fall
# back to numfloors * 10 (assumes ~10 units per floor of height -- TODO confirm
# units). Residential volume prorates total volume by unitsres/unitstotal,
# then sums by the zip code recorded in PLUTO.
Res_Bldg_Footprints <- Bldg_Footprints %>%  
  st_set_geometry(., NULL) %>%
  mutate(base_bbl = as.character(base_bbl)) %>%
  filter(base_bbl %in% ResBBLs &
           feat_code == "2100") %>%  # feat_code "2100" selected here and in the tract version below
  mutate(bldg_volume = shape_area * heightroof) %>%
  left_join(., Pluto_ResOnly, by = "base_bbl") %>%
  mutate(bldg_volume = if_else(is.na(bldg_volume), shape_area*numfloors*10, bldg_volume),
         res_volume = (bldg_volume/unitstotal)*unitsres, 
         zcta = as.character(zipcode)) %>%
  group_by(zcta) %>%
  summarise(total_res_volume_zcta = sum(res_volume, na.rm = TRUE)) 

# tract-level residential area
# FIX: the original piped the footprints INTO suppressWarnings() as its first
# (expr) argument, leaving st_centroid(of_largest_polygon = TRUE) as the
# lazily-evaluated `classes` argument -- which is never forced when no warning
# fires, so the centroid step was silently skipped. The braces stop magrittr
# from inserting the lhs, so the centroid is actually computed (with sf's
# attribute-constancy warnings muffled as intended).
Res_Bldg_Footprints2 <- Bldg_Footprints %>% 
  st_transform(crs = 2263) %>%
  { suppressWarnings(st_centroid(., of_largest_polygon = TRUE)) } %>%
  mutate(base_bbl = as.character(base_bbl)) %>%
  filter(base_bbl %in% ResBBLs &
           feat_code == "2100") %>%
  mutate(bldg_volume = shape_area * heightroof) %>%
  left_join(., Pluto_ResOnly, by = "base_bbl") %>%
  # fall back to numfloors * 10 when roof height is missing, as in the ZCTA version
  mutate(bldg_volume = if_else(is.na(bldg_volume), shape_area*numfloors*10, bldg_volume),
         res_volume = (bldg_volume/unitstotal)*unitsres)
# Memoised (pm from Just.universal) overlay of building points with tracts
pm(get.tract.res <- function(res, tracts) st_intersection(res, tracts)) # takes a few minutes
res_bldg_tract <- get.tract.res(Res_Bldg_Footprints2, tractSF)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
res_bldg_tract_sum <- st_set_geometry(res_bldg_tract, NULL) %>%
  group_by(GEOID) %>%
  summarise(total_res_volume_tract = sum(res_volume, na.rm = TRUE))
#nrow(res_bldg_tract_sum) # 2132 (value from the pre-fix run; may change slightly with centroids applied)

COVID Tests

#### COVID Tests ####

# Modified-ZCTA polygons keyed by "zcta" for joining with test data
MODZCTA_NYC_shp1 <- MODZCTA_NYC_shp %>%
  dplyr::select(modzcta, geometry) %>%
  rename("zcta" = "modzcta")

# Cumulative positives and total tests per modified ZCTA as of May 7, 2020
May7_tests <- ZCTA_test_series %>%
  filter(date=="2020-05-07") %>%
  mutate(zcta = as.character(MODZCTA)) %>%
  rename(total_tests = "Total") %>%
  dplyr::select(zcta, date, Positive, total_tests)

# Borough lookup per ZCTA; three ZCTAs missing a borough in the source table
# are appended by hand.
ZCTA_by_boro1 <- ZCTA_by_boro %>%
  mutate(boro = as.character(boro),
         zcta = as.character(zip)) %>%
  dplyr::select(-zip) %>%
  bind_rows(., 
            tibble(boro = as.character(c("Manhattan", "Manhattan" ,"Queens")), #correcting nas
                     zcta = as.character(c("10069", "10282", "11109"))))

# get water mask for maps (presumably defines basemap_water and plot_bounds
# used by the map figures below -- confirm in code/water_mask.R)
source(here("code/water_mask.R"))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
# Figure 3B - Map of Tests by MODZCTA

# Shared theme for the small map panels: grey land background, legend inset
# upper-left, axes fully suppressed.
theme_smallmaps <- theme(legend.title = element_text(face = "bold", size = 9), 
                         plot.title = element_text(size = 9.5),
                         panel.background = element_rect(fill = "#cccccc"), 
                         panel.grid = element_blank(),
                         legend.background = element_rect(fill = "transparent"),
                         legend.position = c(0.22, 0.80),
                         legend.text = element_text(size = 8.5),
                         plot.margin = unit(c(4,0,4,0), "pt"),
                         legend.key.size = unit(1.1, "lines"),
                         axis.text = element_blank(),
                         axis.ticks = element_blank(),
                         axis.ticks.length = unit(0, "pt"),
                         axis.title = element_blank())

# Choropleth of cumulative positives per 100,000 residents by modified ZCTA.
# "99999" is the non-geographic placeholder ZCTA and is dropped.
fig3b <- MODZCTA_NYC_shp1 %>%
  left_join(., May7_tests, by = "zcta") %>%
  left_join(., ACS_Data2, by = "zcta") %>%
  filter(zcta != "99999") %>%
  mutate(pos_per_100000 = (Positive/total_pop1)*100000) %>%
  ggplot() +
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(aes(fill = pos_per_100000), lwd = 0.2)+
  # ggsn scalebar anchored off the southeast corner of the city
  scalebar(MODZCTA_NYC_shp1, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5, 
           anchor = c(x = -73.71, y = 40.51)) + 
  labs(fill = "Positives per 100,000") +
  ggtitle("Cumulative positive COVID tests by zip code (May 7, 2020)") +
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  coord_sf(crs = st_crs(MODZCTA_NYC_shp1),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  theme_bw(base_size = 6) + 
  theme_smallmaps

Create data frames of all above information

#### Create data frames of all above information ####

# Master ZCTA-level table: joins ACS demographics, May 7 test counts,
# residential volume, essential-worker commuting, and grocer counts onto the
# MODZCTA polygons, then derives the candidate BWQS exposure variables.
ZCTA_ACS_COVID_shp <- MODZCTA_NYC_shp1 %>%
  st_transform(., crs = 2263) %>%
  dplyr::select(zcta, geometry) %>%
  left_join(., ACS_Data2, by = "zcta") %>%
  left_join(., May7_tests, by = "zcta") %>%
  left_join(., Res_Bldg_Footprints, by = "zcta") %>%
  left_join(., ACS_EssentialWrkr_Commute1, by = "zcta") %>%
  left_join(., zcta_grocers, by = c("zcta" = "modzcta")) %>%
  mutate(pop_density = as.numeric(total_pop1/st_area(geometry)),
         avg_hhold_size = round((total_pop1/total_hholds1), 2),
         pos_per_100000 = (Positive/total_pop1)*100000,
         testing_ratio = (total_tests/total_pop1),
         res_vol_zctadensity = as.numeric(total_res_volume_zcta/st_area(geometry)), 
         res_vol_popdensity = as.numeric(total_pop1/total_res_volume_zcta),
         pubtrans_ferrysubway_commute = pubtrans_subway_commute + pubtrans_ferry_commute,
         grocers = replace_na(grocers, 0),  # ZCTAs with no geocoded grocers
         grocers_per_1000 = (grocers/total_pop1)*1000,
         pos_per_100000 = round(pos_per_100000, 0),
         valid_var = "0",
         # reciprocal so fewer grocers = higher value; zero-grocer ZCTAs get
         # 1/.0293 + 1 (presumably max observed reciprocal + 1 -- confirm .0293
         # is the minimum nonzero grocers_per_1000)
         one_over_grocers_per_1000 = if_else(is.infinite(1/grocers_per_1000), (1/.0293)+1, 1/grocers_per_1000), #max + 1 for zero values
         didnot_workhome_commute = 100 - workhome_commute,
         one_over_medincome = 1/medincome) %>%
  dplyr::select(-pubtrans_subway_commute, -pubtrans_ferry_commute) %>%
  left_join(., ZCTA_by_boro1, by = "zcta") %>%
  # essential-worker counts expressed as percent of workers 16+ in any industry
  mutate_at(vars(starts_with("essentialworker_")), ~round((./over16total_industry1)*100, 2)) %>%
  filter(zcta != "99999") #remove na

# Non-spatial copy used for modeling
ZCTA_ACS_COVID <- ZCTA_ACS_COVID_shp %>%
  st_set_geometry(., NULL) #remove geometry

# Tract-level analogue of the ZCTA table above: join ACS, commuting,
# residential volume, and grocer counts onto tracts and derive the same
# exposure variables (no test data exists at tract level).
tract_vars <- tractSF %>% # uses local CRS
  left_join(., st_set_geometry(acs_tracts2, NULL), by = "GEOID") %>%
  left_join(., acs_tracts_commute1, by = "GEOID") %>%
  left_join(., res_bldg_tract_sum, by = "GEOID") %>%
  left_join(., tract_grocers, by = "GEOID") %>%
  mutate(pop_density = as.numeric(total_pop1/st_area(geometry)),
         avg_hhold_size = round((total_pop1/total_hholds1), 2),
         res_vol_tractdensity = as.numeric(total_res_volume_tract/st_area(geometry)), 
         res_vol_popdensity = as.numeric(total_pop1/total_res_volume_tract),
         pubtrans_ferrysubway_commute = pubtrans_subway_commute + pubtrans_ferry_commute,
         grocers = replace_na(grocers, 0),
         grocers_per_1000 = (grocers/total_pop1)*1000,
         valid_var = "0",
         didnot_workhome_commute = 100 - workhome_commute,
         # same zero-grocer imputation constant as the ZCTA table
         one_over_grocers_per_1000 = if_else(is.infinite(1/grocers_per_1000), (1/.0293)+1, 1/grocers_per_1000),
         one_over_medincome = 1/medincome) %>% 
  dplyr::select(-pubtrans_subway_commute, -pubtrans_ferry_commute) %>%
  mutate_at(vars(starts_with("essentialworker_")), ~round((./over16total_industry1)*100, 2))

Tract to ZCTA weighted assignment

#### Tract to ZCTA weighted assignment ####
# Shares of residential addresses in ZCTAs by Tract are later used to select
# representative tract-level SES values at a specified location on the ECDF

# Character keys so joins below don't mix integer/character zip codes
modzcta_to_zcta_chr <- data.frame(ZCTA = as.character(modzcta_to_zcta$ZCTA), 
                                  MODZCTA = as.character(modzcta_to_zcta$MODZCTA))

# Calculate the proportion of population each combined ZCTA contributes to MODZCTA
modzcta_to_zcta_pop <- modzcta_to_zcta_chr %>%
  left_join(ACS_Data_scaled[, c("GEOID", "total_pop1")], by = c("ZCTA" = "GEOID")) %>% 
  group_by(MODZCTA) %>%
  mutate(sum_pop = sum(total_pop1)) %>%
  ungroup() %>%
  mutate(pop_prop = total_pop1/sum_pop) %>% 
  arrange(MODZCTA)

# Sum ZCTA->Tract RES_RATIO weights when multiple ZCTA combine to a single MODZCTA
modzcta_to_tract <- modzcta_to_zcta_pop %>%
  filter(MODZCTA != "99999") %>% 
  # Weights of each ZCTA are scaled by their proportion of MODZCTA population
  left_join(zip_to_tract, by = c("ZCTA" = "ZIP")) %>%
  mutate(scaled_res_ratio = RES_RATIO * pop_prop) %>% 
  # note: MODZCTA 10001 & 10007 contain a ZCTA with a small share of population
  # but a zero share of RES_RATIO. This makes their sum of scaled_res_ratio by
  # MODZCTA less than 1, but this will not effect the weighted median or 3Q values
  # by tract.
  group_by(MODZCTA, TRACT) %>%
  dplyr::summarize(SUM_RES_RATIO = sum(scaled_res_ratio), .groups = "drop") %>%
  # Weights are then multiplied by the population density of each tract
  # (here "density" is persons per household: total_pop1/total_hholds1)
  left_join(tract_vars, by = c("TRACT" = "GEOID")) %>%
  dplyr::select(MODZCTA, TRACT, SUM_RES_RATIO, total_pop1, total_hholds1) %>%
  filter(total_hholds1 > 0) %>% 
  mutate(tract_popdens = total_pop1/total_hholds1) %>% 
  mutate(W_RES_RATIO = SUM_RES_RATIO * tract_popdens) %>%
  dplyr::select(MODZCTA, TRACT, W_RES_RATIO)
modzcta_to_tract
## # A tibble: 3,020 x 3
##    MODZCTA TRACT       W_RES_RATIO
##    <chr>   <chr>             <dbl>
##  1 10001   36061005600     0.0307 
##  2 10001   36061005800     0.0648 
##  3 10001   36061007100     0      
##  4 10001   36061007600     0.250  
##  5 10001   36061008400     0.00422
##  6 10001   36061009100     0.154  
##  7 10001   36061009300     0.145  
##  8 10001   36061009500     0.231  
##  9 10001   36061009700     0.250  
## 10 10001   36061009900     0.253  
## # … with 3,010 more rows
# length(unique(modzcta_to_tract$MODZCTA)) #  177
# length(unique(modzcta_to_tract$TRACT))   # 2111
# length(unique(tractSF$GEOID))            # 2167

# check res_ratio against tracts with no population
tract_modzcta_pop <- modzcta_to_tract %>%
  left_join(acs_tracts2, by = c("TRACT" = "GEOID")) %>%
  dplyr::select(MODZCTA, TRACT, total_pop1, W_RES_RATIO)

# checking for tracts with no population but res_ratio > 0
# (not possible now that the weighted ratio uses population)
#tract_modzcta_pop %>% filter(total_pop1 == 0 & W_RES_RATIO > 0) %>% nrow() # 0

# check to make sure no MODZCTA have all zeroes for all tract res_ratios
# (such a MODZCTA would have no tract values to draw SES quantiles from)
all_zero_ratio <- tract_modzcta_pop %>% group_by(MODZCTA) %>% filter(all(W_RES_RATIO == 0))
if(nrow(all_zero_ratio) > 0){
  warning("The following MODZCTA have all zeroes for weighted tract residents:\n ", 
          paste(all_zero_ratio$MODZCTA, collapse = ","))
} 
# Final MODZCTA->tract weight table used downstream
modzcta_to_tract2 <- dplyr::select(tract_modzcta_pop, -total_pop1)

### Preparation done --- Now for the Analysis ###

Part 1: Creation of BWQS Neighborhood Infection Risk Scores

#### Part 1: Creation of BWQS Neighborhood Infection Risk Scores ####

# Step 1: Create univariate scatterplots with negative binomial linear smoothers
# to make sure direction of associations are consistent for all variables
# Visualize the non-linear relationship between infection rate and test_ratio (not adjusted for social inequity)
# infections versus testing ratio with smoothers; this covariate is not quantiled so the shape of association is important
# Three smoothers are overlaid for comparison: NB-GLM (red), NB-GAM (green),
# and an NB-GLM on a 3-df natural spline (default blue).
ggplot(ZCTA_ACS_COVID, aes(x = testing_ratio, y = pos_per_100000)) + geom_point() +
  ylab("infections per 100,000") +
  geom_smooth(color = "red", formula = y ~ x, method = "glm.nb") +
  stat_smooth(color = "green", method = "gam", formula = y ~ s(x), method.args = list(family = "nb"), se = F) +
  geom_smooth(method = "glm.nb", formula = y ~ splines::ns(x, 3), se = FALSE) + theme_bw()

# 10 social variables
# Template plot reused via `%+%`, swapping in a new x aesthetic each time
plotsocial <- ggplot(ZCTA_ACS_COVID, aes(y = pos_per_100000)) + 
  geom_point() + geom_smooth(color = "red", formula = y ~ x, method = "glm.nb") + ylab("infections per 100,000") + theme_bw()
plotsocial %+% aes(x = one_over_medincome)

plotsocial %+% aes(x = not_insured)

plotsocial %+% aes(x = one_over_grocers_per_1000)

plotsocial %+% aes(x = unemployed)

plotsocial %+% aes(x = res_vol_popdensity)

plotsocial %+% aes(x = didnot_workhome_commute)

plotsocial %+% aes(x = not_quarantined_jobs)

plotsocial %+% aes(x = avg_hhold_size)

plotsocial %+% aes(x = essentialworker_drove)

plotsocial %+% aes(x = essentialworker_pubtrans)

# The 10 SES exposure variables that enter the BWQS index (column-name vector)
SES_vars <- names(ZCTA_ACS_COVID %>% dplyr::select(one_over_medincome, not_insured, one_over_grocers_per_1000, unemployed, 
                                                    not_quarantined_jobs, essentialworker_pubtrans, essentialworker_drove, 
                                                    didnot_workhome_commute, res_vol_popdensity, avg_hhold_size))

# Step 2a: Examine relationships between explanatory variables to make sure nothing >0.9 correlation, as this could bias BWQS
Cors_SESVars <- cor(x = ZCTA_ACS_COVID %>% dplyr::select(all_of(SES_vars)), method = "kendall")
Cors_SESVars1 <- as.data.frame(Cors_SESVars)
Cors_SESVars1$var1 <- row.names(Cors_SESVars1)
# Long format with self-pairs removed (symmetric duplicates handled below via distinct())
Cors_SESVars2 <- gather(data = Cors_SESVars1, key = "var2", value = "correlation", -var1) %>%
  filter(var1 != var2) 

Cors_SESVars2 %>% arrange(desc(correlation)) %>% distinct(correlation, .keep_all = T) 
##                        var1                     var2 correlation
## 1     essentialworker_drove     not_quarantined_jobs   0.6121033
## 2  essentialworker_pubtrans       one_over_medincome   0.5432472
## 3                unemployed       one_over_medincome   0.5341512
## 4               not_insured       one_over_medincome   0.5270258
## 5  essentialworker_pubtrans               unemployed   0.5243542
## 6   didnot_workhome_commute    essentialworker_drove   0.5086620
## 7            avg_hhold_size     not_quarantined_jobs   0.4787113
## 8   didnot_workhome_commute     not_quarantined_jobs   0.4763680
## 9            avg_hhold_size  didnot_workhome_commute   0.4598744
## 10       res_vol_popdensity       one_over_medincome   0.4549307
## 11       res_vol_popdensity essentialworker_pubtrans   0.4507468
## 12       res_vol_popdensity              not_insured   0.4391560
## 13           avg_hhold_size    essentialworker_drove   0.4198843
##  [ reached 'max' / getOption("max.print") -- omitted 32 rows ]
# Supplemental Figure 1. Passing the correlation matrix itself as p.mat with
# sig.level = -1 and insig = "p-value" is a corrplot trick that prints the
# correlation values on the lower triangle.
if(export.figs) {
  png(filename = file.path(fig.path, paste0("sfig1_", Sys.Date(), ".png")), width = 96*7, height = 96*7)
  corrplot(Cors_SESVars, p.mat = Cors_SESVars, insig = "p-value", type = "lower", 
         sig.level = -1, tl.col = "black", tl.srt = 45)
  dev.off()
}
## png 
##   2

# Step 2b: Examine Univariable kendall associations for all selected variables with the outcome  
# Binds three columns: variable names, Kendall tau with the outcome, and the
# cor.test p-value. The t() %>% as_tibble() steps yield two unnamed "V1"
# columns which tibble name-repairs to V1...2 / V1...3 (referenced below).
bind_cols(Variables = SES_vars,
          
        ZCTA_ACS_COVID %>%
            dplyr::select(all_of(SES_vars), pos_per_100000) %>%
            summarise_at(vars(all_of(SES_vars)), list(~cor(., pos_per_100000, method = "kendall"))) %>%
            t() %>%
            as_tibble(),
        
        ZCTA_ACS_COVID %>%
          dplyr::select(all_of(SES_vars), pos_per_100000) %>%
          summarise_at(vars(all_of(SES_vars)), 
                       list(~cor.test(., pos_per_100000, method = "kendall")$p.value)) %>%
          t() %>%
          as_tibble()) %>%
  
  mutate(`Correlation (Tau)` = round(V1...2, 3),
         `p value` = as.character(ifelse(V1...3 < 0.0001, "< 0.0001", round(V1...3, 3))),) %>%
  dplyr::select(-V1...2, -V1...3) 
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## New names:
## * V1 -> V1...2
## * V1 -> V1...3
## # A tibble: 10 x 3
##    Variables                 `Correlation (Tau)` `p value`
##    <chr>                                   <dbl> <chr>    
##  1 one_over_medincome                      0.322 < 0.0001 
##  2 not_insured                             0.236 < 0.0001 
##  3 one_over_grocers_per_1000               0.268 < 0.0001 
##  4 unemployed                              0.294 < 0.0001 
##  5 not_quarantined_jobs                    0.542 < 0.0001 
##  6 essentialworker_pubtrans                0.175 0.001    
##  7 essentialworker_drove                   0.483 < 0.0001 
##  8 didnot_workhome_commute                 0.453 < 0.0001 
##  9 res_vol_popdensity                      0.106 0.035    
## 10 avg_hhold_size                          0.463 < 0.0001
# Step 3: Prepare data for BWQS and pass to stan for model fitting 
# Fit the BWQS stan model (memoised via pm() so re-runs reuse the cached fit).
# Single chain; 20000 iterations with 2500 warmup, thinned by 10 -> 1750
# retained draws. The very high adapt_delta and max_treedepth trade sampling
# speed for avoiding divergent transitions.
pm(fit_BWQS_model <- function(data_list, stan_model_path){
  fitted_model <- stan(
    file = stan_model_path,
    data = data_list,
    chains = 1,
    warmup = 2500,
    iter = 20000,
    thin = 10,
    refresh = 0,
    algorithm = "NUTS",
    seed = 1234,  # fixed seed for reproducibility of the single chain
    control = list(max_treedepth = 20,
                   adapt_delta = 0.999999999999999))
  return(fitted_model)
})

# Bayesian R^2 for the negative-binomial BWQS fit: per posterior draw, the
# variance of the fitted means divided by (fitted variance + NB2 residual
# variance mu + mu^2/phi). Returns mean, median, and a 95% interval over draws.
Compute_Bayes_R2 <- function(fit) {
  mu_draws  <- exp(extract(fit, "eta")$eta)      # draws x obs, response scale
  phi_draws <- extract(fit, "phi")$phi           # overdispersion per draw
  fit_var   <- apply(mu_draws, 1, var)           # variance of fitted values
  fit_mean  <- apply(mu_draws, 1, mean)
  resid_var <- fit_mean + fit_mean^2 / phi_draws # NB2 residual variance
  r2_draws  <- fit_var / (fit_var + resid_var)
  list(meanr2   = mean(r2_draws),
       medianr2 = median(r2_draws),
       lowerr2  = quantile(r2_draws, .025),
       upperr2  = quantile(r2_draws, .975))
}

# Assemble the data list expected by the BWQS stan model.
#
# df           : data frame with pos_per_100000 (outcome), testing_ratio, and
#                the columns named in ses_varnames
# ses_varnames : character vector of SES exposure columns
#
# Each SES variable is mapped through its own empirical CDF and scaled to
# 0-10 (decile-style scoring); testing_ratio enters as a 3-df natural cubic
# spline basis. Returns list(data_list, X, K) where data_list feeds stan and
# X/K are kept for downstream index construction.
prep_BWQS_data <- function(df, ses_varnames){
  y <- as.numeric(df$pos_per_100000)
  X <- df %>%
    dplyr::select(all_of(ses_varnames))
  K <- ns(df$testing_ratio, df = 3)
  # ECDF-score each exposure onto a 0-10 scale
  for (vname in ses_varnames) {
    X[[vname]] <- ecdf(X[[vname]])(X[[vname]]) * 10
  }
  data <- as.data.frame(cbind(y, X))  # aggregate into one data.frame

  # "_new" entries reuse the training design so stan also emits in-sample
  # posterior predictions (y_new)
  stan_data <- list(N      = NROW(data),
                    N_new  = NROW(data),
                    C      = NCOL(X),
                    K      = NCOL(K),
                    XC     = cbind(as.matrix(X)),
                    XK     = cbind(K),
                    XC_new = cbind(as.matrix(X)),
                    XK_new = cbind(K),
                    Dalp   = rep(1, length(ses_varnames)),  # flat Dirichlet prior
                    y      = as.vector(data$y))
  list(data_list = stan_data, X = X, K = K)
}

# Build the stan data list from the ZCTA table and the 10 SES variables
m1data <- prep_BWQS_data(ZCTA_ACS_COVID, SES_vars)

t_dataprep = Sys.time()

# fit our primary model -- negative binomial
m1 <- fit_BWQS_model(m1data$data_list, BWQS_stan_model)

t_m1 = Sys.time()

# print m1 for model diagnostics (n_eff as % of 1750, Rhat, trace plots, acf)
# m1
min(summary(m1)$summary[,"n_eff"]/1750) # effective sample size is at worst ~62% of the 1750 retained draws
## [1] 0.6186883
# Trace and autocorrelation plots for the BWQS slope and the weight vector W
traceplot(m1, pars = c("beta1", "W"))

stan_ac(m1, pars = c("beta1", "W"))
## Warning: Ignoring unknown parameters: fun.y
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
# Information criteria and Bayesian R^2 for the fitted model
extract_waic(m1)
## $waic
##     waic 
## 2369.804 
## 
## $elpd_waic
## elpd_waic 
## -1184.902 
## 
## $p_waic
##   p_waic 
## 14.69814 
## 
## $elpd_loo
##  elpd_loo 
## -1184.963 
## 
## $p_loo
##    p_loo 
## 14.75942
round(Compute_Bayes_R2(m1)$meanr2, 2)
## [1] 0.93
round(Compute_Bayes_R2(m1)$lowerr2, 2)
## 2.5% 
## 0.92
round(Compute_Bayes_R2(m1)$upperr2, 2)
## 97.5% 
##  0.95
# residual analysis with DHARMa
# Rows of y_new are posterior draws, so the transpose gives one simulated
# response vector per column as DHARMa expects.
residuals_qq <- createDHARMa(simulatedResponse = t(extract(m1,"y_new")$y_new), observedResponse = m1data$data_list$y)
## No fitted predicted response provided, using the mean of the simulations
#plotQQunif(residuals_qq, testOutliers = F, testDispersion = F) #base graphics
ks_unif <- testUniformity(residuals_qq)
# gap::qqunif both draws a base-graphics plot and returns the x/y coordinates,
# which are re-plotted with ggplot below
residuals_qq_unif <- gap::qqunif(residuals_qq$scaledResiduals,pch=2,bty="n", logscale = F, col = "black", cex = 0.6, main = "QQ plot residuals", cex.main = 1)
sfig3_qq <- ggplot(data.frame(x = residuals_qq_unif$x, y = residuals_qq_unif$y)) +
  geom_abline(linetype = "dashed", color = "blue") +
  scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  geom_point(aes(x, y), size = 0.8) +
  annotate("text", x = 0.02, y = 0.9, hjust = 0, size = 3.2,
           label = paste0("Kolmogorov-Smirnov test for\ncomparison of distributions: p=",
                          round(ks_unif$p.value, 2))) +
  coord_equal(ratio = 1) +
  labs(x = "Expected", y = "Observed") +
  theme_bw() + theme(plot.margin = unit(c(5.5, 11, 5.5, 5.5), "points"))
# FIX: pass the plot explicitly. sfig3_qq is never printed, so the original
# call relied on ggsave()'s default last_plot(), which at this point is a
# different (previously displayed) figure.
if(export.figs) ggsave(filename = file.path(fig.path, paste0("sfig3_", Sys.Date(), ".png")), plot = sfig3_qq, width = 3.5, height = 3.4)

# examine parameter estimates
# posterior mean rate ratio per one-unit increase of the BWQS index
exp(mean(extract(m1, "beta1")$beta1))
## [1] 1.076975
# Parameter order in the stan summary: phi, beta0, beta1, three spline deltas,
# then the 10 BWQS weights in SES_vars order
vars = c("phi", "beta0", "beta1", paste0("delta", 1:3), SES_vars)
parameters_to_drop <- c("phi", paste0("delta", 1:3), "beta0", "beta1")
number_of_coefficients <- length(vars)

# Columns kept: mean, 2.5%, 50%, 97.5%, n_eff, Rhat
BWQS_params <- bind_cols(as_tibble(summary(m1)$summary[1:number_of_coefficients,c(1,4,6,8:10)]), label = vars)

# FIX: exponentiate all four posterior-summary columns (mean, 2.5%, 50%, and
# 97.5%). The original vars(1:3) left the 97.5% bound on the log scale -- the
# cached output below shows 0.0855 where exp(0.086) ~ 1.09 was intended.
BWQS_params %>% filter(label == "beta1") %>% mutate_at(vars(1:4), ~exp(.))
## # A tibble: 1 x 7  (pre-fix output; 97.5% column was not yet exponentiated)
##    mean `2.5%` `50%` `97.5%` n_eff  Rhat label
##   <dbl>  <dbl> <dbl>   <dbl> <dbl> <dbl> <chr>
## 1  1.08   1.06  1.08  0.0855 1631.  1.00 beta1
# Weight estimates with 95% credible bounds, keeping only the 10 SES weights
# and assigning each to a thematic group for the faceted figure.
BWQS_fits <- BWQS_params %>%
  rename(lower = "2.5%",
         upper = "97.5%") %>%
  filter(!label %in% parameters_to_drop) %>%
  arrange(desc(mean)) %>%
  mutate(group = factor(if_else(label == "one_over_medincome"|label =="not_insured"|label == "unemployed", "Finances &\nAccess to care",
                         if_else(label == "one_over_grocers_per_1000", "Food\nAccess",
                                 if_else(str_detect(label, "essential")|label == "not_quarantined_jobs"|label=="didnot_workhome_commute", "Commuting and\nEssential Work",
                                         if_else(label == "avg_hhold_size"|label == "res_vol_popdensity", "Population\nDensity", "Unmatched")))),
                        levels = c("Commuting and\nEssential Work", "Finances &\nAccess to care", "Population\nDensity", "Food\nAccess")))

# Display labels for the 10 SES variables (used on figure axes)
labels1 <- c("one_over_medincome" = "1/\nMedian income", 
             "not_insured" = "Uninsured", 
             "unemployed" = "Unemployed", 
             "one_over_grocers_per_1000" = "1/\nGrocers per 1000",
             "not_quarantined_jobs" = "Essential Workers", 
             "essentialworker_pubtrans" = "Essential Worker:\nPublic Transit", 
             "essentialworker_drove" = "Essential Worker:\nDriving Commute", 
             "didnot_workhome_commute" = "1/\nWork from home", 
             "res_vol_popdensity" = "Population Density\nby Residential Volume", 
             "avg_hhold_size" = "Average people\nper household")

# Labels for all model parameters, in the same order as `vars` (Supp. Table 2)
labels2 <- c("phi" = "Overdispersion", 
             "beta0" = "Intercept", 
             "beta1" = "BWQS term",
             "delta1" = "Testing ratio: spline term 1",
             "delta2" = "Testing ratio: spline term 2",
             "delta3" = "Testing ratio: spline term 3",
             labels1)

# Supplemental Table 2: Parameter estimates, credible intervals, and diagnostics from main BWQS infections model
# Rounds estimates, formats the 95% CrI as "(lower, upper)", and renders an
# HTML table. NOTE(review): kbl() has no font_size argument (that belongs to
# kable_styling()); it appears to pass through unused -- confirm.
BWQS_params %>% bind_cols(., "terms" = labels2) %>%
  dplyr::select(-label) %>%
  mutate_at(vars(1:6), ~round(., 3)) %>%
  mutate_at(vars(5), ~round(., 0)) %>% 
  mutate(`95% CrI`=paste0("(",format(`2.5%`, nsmall=3),", ",format(`97.5%`, nsmall=3),")")) %>% 
  mutate(terms = 
           case_when(terms=="BWQS term" ~ "COVID-19 inequity index",
                     TRUE               ~ terms)) %>%
  dplyr::select(-`2.5%`, -`97.5%`) %>%
  rename(median=`50%`) %>%
  dplyr::select(terms, mean, `95% CrI`, median, Rhat, n_eff) %>%
  kbl(align=rep('r', 6), font_size = 9.5) %>%
  kable_classic(full_width = F, html_font = "Arial")
terms mean 95% CrI median Rhat n_eff
Overdispersion 104.271 (81.793, 129.364) 103.650 1.000 1730
Intercept 6.261 ( 6.177, 6.344) 6.261 0.999 1663
COVID-19 inequity index 0.074 ( 0.063, 0.086) 0.074 1.001 1631
Testing ratio: spline term 1 0.983 ( 0.893, 1.072) 0.985 1.000 1781
Testing ratio: spline term 2 2.029 ( 1.821, 2.244) 2.025 1.000 1539
Testing ratio: spline term 3 1.121 ( 1.002, 1.233) 1.121 0.999 1546
1/ Median income 0.105 ( 0.009, 0.231) 0.100 1.000 1638
Uninsured 0.177 ( 0.056, 0.302) 0.176 1.000 1505
Unemployed 0.027 ( 0.001, 0.076) 0.022 1.000 1672
1/ Grocers per 1000 0.055 ( 0.003, 0.143) 0.049 1.000 1660
Essential Workers 0.062 ( 0.002, 0.173) 0.052 0.999 1528
Essential Worker: Public Transit 0.050 ( 0.002, 0.138) 0.042 0.999 1771
Essential Worker: Driving Commute 0.166 ( 0.035, 0.289) 0.167 1.000 1790
1/ Work from home 0.089 ( 0.009, 0.193) 0.086 1.000 1706
Population Density by Residential Volume 0.108 ( 0.013, 0.210) 0.107 0.999 1798
Average people per household 0.161 ( 0.032, 0.297) 0.158 0.999 1715
# create a figure with parameter estimates for the weights
fig2 <- ggplot(data=BWQS_fits, aes(x= reorder(label, mean), y=mean, ymin=lower, ymax=upper)) +
  geom_pointrange() + 
  coord_flip() + 
  xlab("") + 
  ylab("Mean (95% credible intervals)") +
  scale_x_discrete(labels = labels1) + 
  # NOTE(review): theme_set() globally switches the session theme to
  # theme_bw(18) and returns the *previous* theme, which is what actually gets
  # added to this plot. `+ theme_bw(base_size = 18)` was probably intended,
  # but later figures may rely on the changed global theme -- confirm before
  # changing.
  theme_set(theme_bw(base_size = 18)) +
  facet_grid(group~., scales = "free", space = "free") +
  theme(strip.text.x = element_text(size = 14))
if(export.figs) ggsave(fig2, filename = file.path(fig.path, paste0("fig2", "_", Sys.Date(),".png")), 
                       dpi = 300, width = 8, height = 8)

# Step 4: Construct the summative COVID-19 inequity index value for each ZCTA
# Use the variable-specific weight on the quantiled splits to create a 10 point ZCTA-level infection risk score  
# Posterior-mean weights W, in the same column order as m1data$X
BWQS_weights <- as.numeric(summary(m1)$summary[(length(vars)-length(SES_vars) + 1):number_of_coefficients,c(1)])

# FIX: index the weight vector by the columns of the quantiled design matrix X
# itself, not by col(ZCTA_ACS_COVID) (a much wider table); the original form
# only produced the right products through accidental recycling of the
# flattened index vector. Each column of X is scaled by its weight.
ZCTA_ACS_COVID2 <- m1data$X * BWQS_weights[col(m1data$X)] 

# Index = weighted sum of the ECDF-scored exposures (one value per ZCTA)
BWQS_index <- ZCTA_ACS_COVID2 %>% 
  dplyr::mutate(BWQS_index = rowSums(.)) %>% 
  dplyr::select(BWQS_index) 

# Visualize the relationship between BWQS and test_ratio
ggplot(data.frame(BWQS_index = BWQS_index$BWQS_index, testing_ratio = ZCTA_ACS_COVID$testing_ratio), aes(testing_ratio, BWQS_index)) + geom_point() + 
  geom_smooth() + geom_smooth(color = "red", method = "lm")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# BWQS is correlated with testing_ratio
cor.test(BWQS_index$BWQS_index, ZCTA_ACS_COVID$testing_ratio, method = "spearman")  
## 
##  Spearman's rank correlation rho
## 
## data:  BWQS_index$BWQS_index and ZCTA_ACS_COVID$testing_ratio
## S = 472628, p-value = 1.969e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4885952
# Partial regression plot for BWQS index
# shows a nice linear relationship of BWQS index with infections after adjustment for testing
# Frisch-Waugh style: residualize the outcome (NB regression on the testing
# spline basis) and the index (linear regression on the same basis), then
# relate the two residual sets.
nb_testing_ns3df <- glm.nb(m1data$data_list$y ~ m1data$K)
yresid <- resid(nb_testing_ns3df)
bwqs_testing_ns3df <- lm(BWQS_index$BWQS_index ~ m1data$K)
bwqsresid <- resid(bwqs_testing_ns3df)
ggplot(data.frame(yresid, bwqsresid), aes(bwqsresid, yresid)) + geom_point() + 
  geom_smooth(formula = y ~ x, method = "lm", se = F) + 
  ylab("residual log infection rate\n(adjusted for testing)") + xlab("residual BWQS infection risk index\n(adjusted for testing)")

summary(lm(yresid ~ bwqsresid))
## 
## Call:
## lm(formula = yresid ~ bwqsresid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.61379 -0.36053  0.01232  0.32793  1.95081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05117    0.04794  -1.067    0.287    
## bwqsresid    0.48714    0.03001  16.230   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6378 on 175 degrees of freedom
## Multiple R-squared:  0.6008, Adjusted R-squared:  0.5986 
## F-statistic: 263.4 on 1 and 175 DF,  p-value: < 2.2e-16
rm(nb_testing_ns3df, yresid, bwqs_testing_ns3df, bwqsresid)

# Visualize the full model observed vs predicted -- shows a close relationship (which is why R2 is so high)
# (fixed: the original redundantly wrapped data.frame() around itself)
preddf <- data.frame(pred = colMeans(extract(m1,"y_new")$y_new), y = m1data$data_list$y)
ggplot(preddf, aes(pred, y)) + 
  geom_point() +  
  geom_abline(linetype = "dashed", color = "grey10") + 
  coord_fixed(ratio = 1) + 
  scale_x_continuous("Predicted Infections per 100,000", label=comma, limits = range(preddf)) + 
  scale_y_continuous("Infections per 100,000", label=comma) + 
  theme_bw()

rm(preddf)

# calculate credible interval over the mean predicted infections
# at the median testing_ratio
# Posterior draws of the intercept, BWQS slope, and three spline coefficients
sim_out <- data.frame(extract(m1, pars = c("beta0", "beta1", "delta")))
# Spline basis evaluated at the median testing ratio (predict method for ns)
median_testing <- predict(m1data$K, median(ZCTA_ACS_COVID$testing_ratio))
# calculate term for median testing_rate
sim_out$deltamedian <- with(sim_out, median_testing[1] * delta.1 + 
                              median_testing[2] * delta.2 + 
                              median_testing[3] * delta.3)
# make a sequence of BWQS values
xseqlength <- 300
bwqs_seq <- seq(from = min(BWQS_index$BWQS_index), 
                to = max(BWQS_index$BWQS_index), 
                length.out = xseqlength)
# draws x grid matrix of log-scale linear predictors (%o% = outer product)
sim_matrix <- sim_out$beta0 %o% rep(1, xseqlength) + 
  sim_out$beta1 %o% bwqs_seq + 
  sim_out$deltamedian%o% rep(1, xseqlength)
# Exponentiate to the response scale and summarise across posterior draws
sim_bwqs_df <- data.frame(bwqs_seq, 
                     lower = exp(colQuantiles(sim_matrix, probs = 0.025)),
                     upper = exp(colQuantiles(sim_matrix, probs = 0.975)),
                     mean = exp(colMeans(sim_matrix)))
rm(median_testing, xseqlength, bwqs_seq, sim_matrix)

# calculate credible interval over the mean predicted infections
# at the median COVID-19 inequity index
# BWQS term held at the median index value for every draw
sim_out$beta1median <- sim_out$beta1 * median(BWQS_index$BWQS_index)
# make a sequence of testing_ratio values
xseqlength <- 300
inequity_seq <- data.frame(x = seq(from = min(ZCTA_ACS_COVID$testing_ratio),
                                   to = max(ZCTA_ACS_COVID$testing_ratio),
                                   length.out = xseqlength))
# Evaluate the 3-column spline basis at each grid point (columns X1..X3)
inequity_seq <- data.frame(x = inequity_seq$x, t(sapply(inequity_seq$x, FUN = function(x) predict(m1data$K, x))))
# draws x grid matrix of log-scale linear predictors
sim_matrix <- 
  sim_out$beta0 %o% rep(1, xseqlength) + 
  sim_out$beta1median %o% rep(1, xseqlength) + 
  sim_out$delta.1 %o% inequity_seq$X1  + 
  sim_out$delta.2 %o% inequity_seq$X2  + 
  sim_out$delta.3 %o% inequity_seq$X3
# Response-scale mean and 95% interval over posterior draws
sim_testing_df <- data.frame(x = inequity_seq$x, 
                     lower = exp(colQuantiles(sim_matrix, probs = 0.025)),
                     upper = exp(colQuantiles(sim_matrix, probs = 0.975)),
                     mean = exp(colMeans(sim_matrix)))
rm(sim_out, xseqlength, inequity_seq, sim_matrix)

# Figure 1: observed infections per 100,000 vs. the COVID-19 inequity (BWQS)
# index, with the model's posterior-mean curve and 95% credible ribbon
# evaluated at the median testing_ratio (sim_bwqs_df built above).
BWQS_scatter <- ggplot(sim_bwqs_df, aes(x = bwqs_seq)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = 'grey75') + 
  geom_line(aes(y = mean)) + 
  # observed ZCTA points: BWQS index vs. the modeled outcome y
  geom_point(data = data.frame(BWQS_index, y = m1data$data_list$y), aes(BWQS_index, y), alpha = 0.5) + 
  scale_x_continuous("COVID-19 inequity index") + 
  scale_y_continuous("Infections per 100,000", label=comma)
# add marginal histograms on both axes; ggMarginal returns a grob, so the
# figure must be print()ed explicitly below
BWQS_scatter <- ggExtra::ggMarginal(BWQS_scatter, type = "histogram", fill = "grey40", margins = "both", 
                                    xparams = list(binwidth = 0.5), yparams = list(binwidth = 200))
if(export.figs) {
  # 480 x 480 px device; presumably png() is used instead of ggsave()
  # because ggMarginal output is a gtable, not a ggplot -- confirm
  png(filename = file.path(fig.path, paste0("fig1_", Sys.Date(), ".png")), width = 96*5, height = 96*5)
  print(BWQS_scatter)
  dev.off()
}
## png 
##   2

# Supplementary Figure 2: observed infections per 100,000 vs. testing ratio,
# with the model's 95% credible ribbon and posterior-mean curve evaluated at
# the median BWQS index (sim_testing_df built above).
testing_scatter <- ggplot(sim_testing_df, aes(x = x)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = 'grey75') +
  geom_line(aes(y = mean)) +
  # observed ZCTA points
  geom_point(
    data = data.frame(testing_ratio = ZCTA_ACS_COVID$testing_ratio,
                      y = m1data$data_list$y),
    aes(testing_ratio, y), alpha = 0.5
  ) +
  scale_x_continuous("Testing ratio") +
  scale_y_continuous("Infections per 100,000", label=comma)
# add marginal histograms on both axes
testing_scatter <- ggExtra::ggMarginal(
  testing_scatter, type = "histogram", fill = "grey40", margins = "both",
  xparams = list(binwidth = 0.005), yparams = list(binwidth = 200)
)
if (export.figs) {
  png(filename = file.path(fig.path, paste0("sfig2_", Sys.Date(), ".png")),
      width = 96*5, height = 96*5)
  print(testing_scatter)
  dev.off()
}
## png 
##   2

# Step 5: Visualize the spatial distribution of ZCTA-level infection risk scores 

# Attach the per-ZCTA BWQS index to the ZCTA geometries, then reproject to
# WGS84 (EPSG:4326) so the ggsn scalebar used below computes distances
# correctly.
ZCTA_BWQS_COVID_shp <- ZCTA_ACS_COVID_shp %>%
  bind_cols(BWQS_index) %>%
  st_transform(4326)

# Figure 3A - BWQS Index by ZCTA: choropleth of the COVID-19 inequity index
# with a km scalebar anchored at lon -73.71, lat 40.49.
fig3a <- ggplot() + 
  # water polygons drawn first so the ZCTA fills sit on top
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = ZCTA_BWQS_COVID_shp, aes(fill = BWQS_index), lwd = 0.2) + 
  # transform = TRUE because coordinates are lon/lat (WGS84, set above)
  scalebar(ZCTA_BWQS_COVID_shp, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5, st.dist = 0.011,
           anchor = c(x = -73.71, y = 40.49)) + 
  # 7-step sequential yellow-green-blue ColorBrewer ramp
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  # plot_bounds is defined earlier in the file (not shown here);
  # expand = FALSE clips exactly to those limits
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
            expand = FALSE) +
  theme_bw(base_size = 6) + 
  labs(fill = "COVID-19 Inequity Index") +
  # legend placed inside the panel; axis text/ticks suppressed for a clean map
  theme(legend.title = element_text(face = "bold", size = 9), 
        panel.background = element_rect(fill = "#cccccc"), 
        legend.background = element_rect(fill = "transparent"),
        panel.grid = element_blank(),
        legend.position = c(0.125, 0.90),
        legend.text = element_text(size = 8.5),
        legend.key.size = unit(1.1, "lines"),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank())

# Step 6: Compare quantile distribution of ZCTA-level BWQS scores by the race/ethnic composition of residents
# Aggregate ACS race/ethnicity counts from ZCTA (GEOID) to MODZCTA, and
# derive an "other" count as the remainder of the total population.
Demographics <- ACS_Data_scaled %>% 
  dplyr::select(GEOID, ends_with("_raceethnic"), total_pop1) %>%
  left_join(modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
  group_by(MODZCTA) %>%
  # across() replaces the superseded summarise_at(vars(...), ~sum(.)) idiom
  summarise(across(c(ends_with("_raceethnic"), total_pop1), sum)) %>%
  mutate(zcta = as.character(MODZCTA),
         other_raceethnic = total_pop1 - (hisplat_raceethnic + nonhispLat_white_raceethnic +
                                            nonhispLat_black_raceethnic +
                                            nonhispLat_amerindian_raceethnic +
                                            nonhispLat_asian_raceethnic))
  
# Plain data frame of per-ZCTA BWQS scores (geometry not needed here)
ZCTA_BQWS <- ZCTA_BWQS_COVID_shp %>%
  st_drop_geometry() %>%
  dplyr::select(zcta, BWQS_index)

# Long-format population-by-race/ethnicity table with each MODZCTA's BWQS
# index attached, used for the population-weighted ridgeline plot (fig4).
# BUG FIX: the original gather(hisplat_raceethnic:other_raceethnic) was a
# positional range that also swept in the character `zcta` column sitting
# between those columns (which is why Population needed as.numeric());
# those rows were mislabeled "Other" with the ZCTA code used as a weight.
# Pivoting by name on ends_with("_raceethnic") selects exactly the six
# race/ethnicity count columns.
Demographics_for_ridges <- Demographics %>%
  left_join(ZCTA_BQWS, by = "zcta") %>%
  dplyr::select(-total_pop1) %>%
  pivot_longer(ends_with("_raceethnic"),
               names_to = "Race/Ethnicity", values_to = "Population") %>%
  mutate(
    `Race/Ethnicity` = case_when(
      `Race/Ethnicity` == "hisplat_raceethnic" ~ "Hispanic/Latinx",
      `Race/Ethnicity` == "nonhispLat_black_raceethnic" ~ "Black",
      `Race/Ethnicity` == "nonhispLat_white_raceethnic" ~ "White",
      `Race/Ethnicity` == "nonhispLat_asian_raceethnic" ~ "Asian",
      TRUE ~ "Other"
    ),
    # order the ridges from lowest to highest weighted mean index
    `Race/Ethnicity` = factor(`Race/Ethnicity`,
                              levels = c("White", "Asian", "Other",
                                         "Hispanic/Latinx", "Black")),
    Population = as.numeric(Population)
  )

# Population-weighted mean and median BWQS index by race/ethnic group
# (wrapped in parentheses so the result prints when the script is sourced)
(Demographics_for_ridges %>%
  group_by(`Race/Ethnicity`) %>%
  summarise(weighted.mean(BWQS_index, Population),
            weightedMedian(BWQS_index, Population)))
## # A tibble: 5 x 3
##   `Race/Ethnicity` `weighted.mean(BWQS_index, Po… `weightedMedian(BWQS_index, P…
##   <fct>                                     <dbl>                          <dbl>
## 1 White                                      4.34                           4.71
## 2 Asian                                      5.48                           6.20
## 3 Other                                      5.14                           5.67
## 4 Hispanic/Latinx                            6.10                           6.60
## 5 Black                                      6.26                           6.51
# Figure 4: population-weighted density ridges of the COVID-19 inequity
# index by race/ethnicity.
fig4 <- ggplot(Demographics_for_ridges,
               aes(x = BWQS_index, y = `Race/Ethnicity`)) +
  xlab("COVID-19 inequity index") +
  geom_density_ridges(
    # after_stat() replaces the deprecated ..density.. pronoun
    aes(height = after_stat(density),
        weight = Population / sum(Population)),
    scale = 0.95,
    stat = "density") +
  scale_y_discrete(expand = c(0, 0)) +
  expand_limits(y = 6) +
  theme(legend.position = "none")
if(export.figs) ggsave(plot = fig4, filename = file.path(fig.path, paste0("fig4","_",Sys.Date(),".png")), 
                       dpi = 300, device = "png", width = 8, height = 5)

# ZCTAs in the bottom quartile of the BWQS index, and their race/ethnic
# composition expressed as a percentage of the group's total population.
Below_25th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index < quantile(BWQS_index, .25))
Race_Ethncity_below25th <- Demographics %>%
  filter(zcta %in% Below_25th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>%
  # across() replaces the defunct funs()/summarise_all()/mutate_at() idioms
  summarise(across(everything(), sum)) %>%
  mutate(across(ends_with("_raceethnic"), ~ round((.x / total_pop1) * 100, 2))) %>%
  mutate(Group = "Below 25th percentile BWQS")

# ZCTAs in the interquartile range of the BWQS index.
# BUG FIX: the original used strict inequalities on both ends, so a ZCTA
# falling exactly on the 25th percentile belonged to no group; >= closes
# the gap (the 75th-percentile boundary belongs to the "Above" group).
Between_25thand75th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index >= quantile(BWQS_index, .25) & BWQS_index < quantile(BWQS_index, .75))

Race_Ethncity_btw_25th75th <- Demographics %>%
  filter(zcta %in% Between_25thand75th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>%
  # across() replaces the defunct funs()/summarise_all()/mutate_at() idioms
  summarise(across(everything(), sum)) %>%
  mutate(across(ends_with("_raceethnic"), ~ round((.x / total_pop1) * 100, 2))) %>%
  mutate(Group = "Between 25-75th percentile BWQS")

# ZCTAs in the top quartile of the BWQS index.
# BUG FIX: >= so a ZCTA exactly on the 75th percentile is not dropped from
# every group (the original used a strict inequality).
Above_75th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index >= quantile(BWQS_index, .75))

Race_Ethncity_above_75th <- Demographics %>%
  filter(zcta %in% Above_75th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>%
  # across() replaces the defunct funs()/summarise_all()/mutate_at() idioms
  summarise(across(everything(), sum)) %>%
  mutate(across(ends_with("_raceethnic"), ~ round((.x / total_pop1) * 100, 2))) %>%
  mutate(Group = "Above 75th percentile BWQS")

# Race/ethnic composition of the full city, for comparison with the
# BWQS-quartile groups above.
Race_Ethncity_all <- Demographics %>%
  dplyr::select(-zcta, -MODZCTA) %>%
  # across() replaces the defunct funs()/summarise_all()/mutate_at() idioms
  summarise(across(everything(), sum)) %>%
  mutate(across(ends_with("_raceethnic"), ~ round((.x / total_pop1) * 100, 2))) %>%
  mutate(Group = "NYC Population")

# Combine the citywide and quartile-group summaries and reshape to long
# format for the stacked-bar plot (sfig4).
Demographics_by_BWQS <- bind_rows(Race_Ethncity_all, Race_Ethncity_below25th,
                                  Race_Ethncity_btw_25th75th, Race_Ethncity_above_75th) %>%
  # fold American Indian / Alaska Native percentages into "Other"
  mutate(Other = other_raceethnic + nonhispLat_amerindian_raceethnic) %>%
  dplyr::select(-other_raceethnic, -nonhispLat_amerindian_raceethnic, -total_pop1) %>%
  rename("Hispanic/Latinx" = "hisplat_raceethnic",
         "Black" = "nonhispLat_black_raceethnic",
         "White" = "nonhispLat_white_raceethnic",
         "Asian" = "nonhispLat_asian_raceethnic") %>%
  # name-based pivot_longer() replaces the fragile position-based gather(1:4, 6)
  pivot_longer(-Group, names_to = "Race/Ethnicity", values_to = "Proportion") %>%
  mutate(`Race/Ethnicity` = factor(`Race/Ethnicity`,
                                   levels = c("Other", "Asian", "White",
                                              "Hispanic/Latinx", "Black")),
         Group = factor(Group, levels = c("NYC Population",
                                          "Below 25th percentile BWQS",
                                          "Between 25-75th percentile BWQS",
                                          "Above 75th percentile BWQS")))

# x-axis labels for sfig4. (The original assigned a first, longer label set
# that was immediately overwritten -- dead code, removed; only the
# "%ile index" labels were ever used.)
labels_demographics <- c("NYC Population" = "NYC Population", "Below 25th percentile BWQS" = "Below 25th\n%ile index", 
                         "Between 25-75th percentile BWQS" = "Between 25-75th\n%ile index", "Above 75th percentile BWQS" = "Above 75th\n%ile index")

# Supplementary Figure 4: stacked race/ethnicity composition bars for the
# city and each BWQS quartile group.
sfig4 <- ggplot(Demographics_by_BWQS, aes(fill=`Race/Ethnicity`, y=Proportion, x=Group)) + 
    # backdrop rectangle behind the "NYC Population" bar; fill="gray85" is a
    # literal inside aes(), deliberately creating an extra fill level that is
    # matched by the sixth color in scale_fill_manual() below
    geom_rect(data = subset(Demographics_by_BWQS, Group=="NYC Population"), 
            aes(xmin=as.numeric(Group)-.35,xmax=as.numeric(Group)+.35, ymin=0, ymax=100, fill="gray85"), color = "gray", alpha = .1) +
  # quartile-group bars (wider) and citywide bar (narrower) drawn separately
  geom_bar(data = subset(Demographics_by_BWQS, Group!="NYC Population"), position="stack", stat="identity", width = .75) +
  geom_bar(data = subset(Demographics_by_BWQS, Group=="NYC Population"), position="stack", stat="identity", width = .45) +
  # 5 legend breaks but 6 values: the extra "gray85" level from geom_rect
  # consumes one color and is kept out of the legend via `breaks`
  scale_fill_manual(breaks = c("Other", "Asian", "White", "Hispanic/Latinx", "Black"), 
                    values = c("#984ea3","#ff7f00","gray85", "#4daf4a", "#e41a1c", "#377eb8"))  +
  # label segments >= 5% only, centered within each stacked segment
  geom_text(aes(label=ifelse(Proportion >= 5, paste0(sprintf("%.0f", Proportion),"%"),"")),
            position=position_stack(vjust=0.5), colour="black", size = 8) + 
  scale_y_continuous("Percentage", expand = c(0,0), labels = function(x) paste0(x, "%")) + 
  scale_x_discrete(limits = c( "NYC Population", "Below 25th percentile BWQS", "Between 25-75th percentile BWQS", "Above 75th percentile BWQS"), 
                   labels = labels_demographics) + 
  theme_bw(base_size = 16) +
  theme(legend.text = element_text(size = 14),
        axis.text.y = element_text(size = 16),
        axis.text.x = element_text(size = 16, color = "black"), 
        axis.title.x = element_blank()) 
if(export.figs) ggsave(sfig4, filename = file.path(fig.path, paste0("sfig4","_",Sys.Date(),".png")), device = "png",
                       dpi = 300, width = 12, height = 6)

# timing checkpoint: end of the main (m1) post-processing section
t_postm1 = Sys.time()

BWQS by Tract

#### BWQS by Tract ####

# Summarise tract-level SES exposure variables up to MODZCTA level using a
# weighted quantile (weights = W_RES_RATIO, the tract's residential address
# share within the MODZCTA).
#
# tract_vars:              sf object with tract GEOID and the SES columns
#                          listed in `ses_vars` below
# modzcta_tract_crosswalk: TRACT -> MODZCTA lookup carrying W_RES_RATIO
# whichq:                  which quartile to extract -- "median"/"q2"/"2q"/"2"
#                          or "q3"/"3q"/"3" (case-insensitive)
#
# Returns one row per MODZCTA with each SES column suffixed "_median" or
# "_3Q". Stops with an error for any other `whichq`.
get_tract_vars_by_zcta <- function(tract_vars, modzcta_tract_crosswalk, whichq){
  whichq <- str_to_lower(whichq)
  if (whichq %in% c("median", "q2", "2q", "2")) {
    qname <- "median"
    qnum <- 2   # second element of wtd.quantile() = 50th percentile
  } else if (whichq %in% c("q3", "3q", "3")) {
    qname <- "3Q"
    qnum <- 3   # third element = 75th percentile
  } else {
    stop("unhandled quartile selected: ", whichq)
  }

  # The SES exposure columns summarised for the BWQS mixture; collapsing the
  # ten duplicated summarise lines into across() keeps them in one place.
  ses_vars <- c("essentialworker_drove", "essentialworker_pubtrans",
                "not_quarantined_jobs", "didnot_workhome_commute",
                "not_insured", "one_over_medincome", "unemployed",
                "avg_hhold_size", "res_vol_popdensity",
                "one_over_grocers_per_1000")

  ses_zcta <- tract_vars %>%
    st_set_geometry(NULL) %>%
    left_join(modzcta_tract_crosswalk, by = c("GEOID" = "TRACT")) %>%
    filter(!is.na(MODZCTA)) %>%
    group_by(MODZCTA) %>%
    summarise(across(all_of(ses_vars),
                     ~ Hmisc::wtd.quantile(.x, W_RES_RATIO)[[qnum]]))

  # suffix each SES column with the quantile name, e.g. unemployed_median
  ses_zcta %>% rename_with(~ paste0(.x, "_", qname), all_of(ses_vars))
}

# Select median tract values by MODZCTA, weighted by residential address share and population density by tract
SES_zcta_median <- get_tract_vars_by_zcta(tract_vars, modzcta_to_tract2, "median")
# every column except the MODZCTA key is an exposure for the BWQS mixture
SES_vars_median = names(SES_zcta_median)[names(SES_zcta_median) != "MODZCTA"]

# join SES to testing data: positive/100k and testing_ratio
SES_zcta_median_testing <- ZCTA_ACS_COVID %>%
  dplyr::select(zcta, pos_per_100000, testing_ratio) %>%
  left_join(SES_zcta_median, by = c("zcta" = "MODZCTA"))

# BWQS using median at the tract level 
# (prep_BWQS_data / fit_BWQS_model are defined earlier in the file; the
# "undefined values: log_lik[...]" warnings during fitting come from rstan's
# validity check on the stanfit object)
m2data_median <- prep_BWQS_data(SES_zcta_median_testing, SES_vars_median)
m2_median <- fit_BWQS_model(data_list = m2data_median$data_list, stan_model_path = BWQS_stan_model)
## Warning in validityMethod(object): The following variables have undefined
## values: log_lik[1],The following variables have undefined values: log_lik[2],The
## following variables have undefined values: log_lik[3],The following variables
## have undefined values: log_lik[4],The following variables have undefined
## values: log_lik[5],The following variables have undefined values: log_lik[6],The
## following variables have undefined values: log_lik[7],The following variables
## have undefined values: log_lik[8],The following variables have undefined values:
## log_lik[9],The following variables have undefined values: log_lik[10],The
## following variables have undefined values: log_lik[11],The following
## variables have undefined values: log_lik[12],The following variables have
## undefined values: log_lik[13],The following variables have undefined values:
## log_lik[14],The following variables have undefined values: log_lik[15],The
## following variables have undefined values: log_lik[16],The following
## variables have undefined values: log_lik[17],The following variables have
## undefined values: log_lik[18],The following variables have undefined values:
## log_lik[19],The following variables have undefined values: log_lik[20],The
## following variables have undefined values: log_lik[21],The following
## variables have undefined values: log_lik[22],The following variables have
## undefined values: log_lik[23],The following variables have undefined values:
## log_lik[24],The following variables have undefined values: log_lik[25],The
## following variables have undefined values: log_lik[26],The following
## variables have undefined values: log_lik[27],The following variables have
## undefined values: log_lik[28],The following variables have undefined values:
## log_lik[29],The following variables have undefined values: log_lik[30],The
## following variables have undefined values: log_lik[31],The following
## variables have undefined values: log_lik[32],The following variables have
## undefined values: log_lik[33],The following variables have undefined values:
## log_lik[34],The following variables have undefined values: log_lik[35],The
## following variables have undefined values: log_lik[36],The following
## variables have undefined values: log_lik[37],The following variables have
## undefined values: log_lik[38],The following variables have undefined values:
## log_lik[39],The following variables have undefined values: log_lik[40],The
## following variables have undefined values: log_lik[41],The following
## variables have undefined values: log_lik[42],The following variables have
## undefined values: log_lik[43],The following variables have undefined values:
## log_lik[44],The following variables have undefined values: log_lik[45],The
## following variables have undefined values: log_lik[46],The following
## variables have undefined values: log_lik[47],The following variables have
## undefined values: log_lik[48],The following variables have undefined values:
## log_lik[49],The following variables have undefined values: log_lik[50],The
## following variables have undefined values: log_lik[51],The following
## variables have undefined values: log_lik[52],The following variables have
## undefined values: log_lik[53],The following variables have undefined values:
## log_lik[54],The following variables have undefined values: log_lik[55],The
## following variables have undefined values: log_lik[56],The following
## variables have undefined values: log_lik[57],The following variables have
## undefined values: log_lik[58],The following variables have undefined values:
## log_lik[59],The following variables have undefined values: log_lik[60],The
## following variables have undefined values: log_lik[61],The following
## variables have undefined values: log_lik[62],The following variables have
## undefined values: log_lik[63],The following variables have undefined values:
## log_lik[64],The following variables have undefined values: log_lik[65],The
## following variables have undefined values: log_lik[66],The following
## variables have undefined values: log_lik[67],The following variables have
## undefined values: log_lik[68],The following variables have undefined values:
## log_lik[69],The following variables have undefined values: log_lik[70],The
## following variables have undefined values: log_lik[71],The following
## variables have undefined values: log_lik[72],The following variables have
## undefined values: log_lik[73],The following variables have undefined values:
## log_lik[74],The following variables have undefined values: log_lik[75],The
## following variables have undefined values: log_lik[76],The following
## variables have undefined values: log_lik[77],The following variables have
## undefined values: log_lik[78],The following variables have undefined values:
## log_lik[79],The following variables have undefined values: log_lik[80],The
## following variables have undefined values: log_lik[81],The following
## variables have undefined values: log_lik[82],The following variables have
## undefined values: log_lik[83],The following variables have undefined values:
## log_lik[84],The following variables have undefined values: log_lik[85],The
## following variables have undefined values: log_lik[86],The following
## variables have undefined values: log_lik[87],The following variables have
## undefined values: log_lik[88],The following variables have undefined values:
## log_lik[89],The following variables have undefined values: log_lik[90],The
## following variables have undefined values: log_lik[91],The following
## variables have undefined values: log_lik[92],The following variables have
## undefined values: log_lik[93],The following variables have undefined values:
## log_lik[94],The following variables have undefined values: log_lik[95],The
## following variables have undefined values: log_lik[96],The following
## variables have undefined values: log_lik[97],The following variables have
## undefined values: log_lik[98],The following variables have undefined values:
## log_lik[99],The following variables have undefined values: log_lik[100],The
## following variables have undefined values: log_lik[101],The following
## variables have undefined values: log_lik[102],The following variables have
## undefined values: log_lik[103],The following variables have undefined values:
## log_lik[104],The following variables have undefined values: log_lik[105],The
## following variables have undefined values: log_lik[106],The following
## variables have undefined values: log_lik[107],The following variables have
## undefined values: log_lik[108],The following variables have undefined values:
## log_lik[109],The following variables have undefined values: log_lik[110],The
## following variables have undefined values: log_lik[111],The following
## variables have undefined values: log_lik[112],The following variables have
## undefined values: log_lik[113],The following variables have undefined values:
## log_lik[114],The following variables have undefined values: log_lik[115],The
## following variables have undefined values: log_lik[116],The following
## variables have undefined values: log_lik[117],The following variables have
## undefined values: log_lik[118],The following variables have undefined values:
## log_lik[119],The following variables have undefined values: log_lik[120],The
## following variables have undefined values: log_lik[121],The following
## variables have undefined values: log_lik[122],The following variables have
## undefined values: log_lik[123],The following variables have undefined values:
## log_lik[124],The following variables have undefined values: log_lik[125],The
## following variables have undefined values: log_lik[126],The following
## variables have undefined values: log_lik[127],The following variables have
## undefined values: log_lik[128],The following variables have undefined values:
## log_lik[129],The following variables have undefined values: log_lik[130],The
## following variables have undefined values: log_lik[131],The following
## variables have undefined values: log_lik[132],The following variables have
## undefined values: log_lik[133],The following variables have undefined values:
## log_lik[134],The following variables have undefined values: log_lik[135],The
## following variables have undefined values: log_lik[136],Th
t_m2 <- Sys.time()  # timing checkpoint: end of tract-median BWQS fit

# Third-quartile (75th percentile) tract values by MODZCTA, weighted by
# residential address share; sensitivity analysis parallel to the median fit.
SES_zcta_3q <- get_tract_vars_by_zcta(tract_vars, modzcta_to_tract2, "3q")
SES_vars_3q <- setdiff(names(SES_zcta_3q), "MODZCTA")
SES_zcta_3q_testing <- ZCTA_ACS_COVID %>%
  dplyr::select(zcta, pos_per_100000, testing_ratio) %>%
  left_join(SES_zcta_3q, by = c("zcta" = "MODZCTA"))

# BWQS using the 3rd quartile at the tract level
m2data_3q <- prep_BWQS_data(SES_zcta_3q_testing, SES_vars_3q)
m2_3q <- fit_BWQS_model(data_list = m2data_3q$data_list, stan_model_path = BWQS_stan_model)
## Warning in validityMethod(object): The following variables have undefined
## values: log_lik[1],The following variables have undefined values: log_lik[2],The
## following variables have undefined values: log_lik[3],The following variables
## have undefined values: log_lik[4],The following variables have undefined
## values: log_lik[5],The following variables have undefined values: log_lik[6],The
## following variables have undefined values: log_lik[7],The following variables
## have undefined values: log_lik[8],The following variables have undefined values:
## log_lik[9],The following variables have undefined values: log_lik[10],The
## following variables have undefined values: log_lik[11],The following
## variables have undefined values: log_lik[12],The following variables have
## undefined values: log_lik[13],The following variables have undefined values:
## log_lik[14],The following variables have undefined values: log_lik[15],The
## following variables have undefined values: log_lik[16],The following
## variables have undefined values: log_lik[17],The following variables have
## undefined values: log_lik[18],The following variables have undefined values:
## log_lik[19],The following variables have undefined values: log_lik[20],The
## following variables have undefined values: log_lik[21],The following
## variables have undefined values: log_lik[22],The following variables have
## undefined values: log_lik[23],The following variables have undefined values:
## log_lik[24],The following variables have undefined values: log_lik[25],The
## following variables have undefined values: log_lik[26],The following
## variables have undefined values: log_lik[27],The following variables have
## undefined values: log_lik[28],The following variables have undefined values:
## log_lik[29],The following variables have undefined values: log_lik[30],The
## following variables have undefined values: log_lik[31],The following
## variables have undefined values: log_lik[32],The following variables have
## undefined values: log_lik[33],The following variables have undefined values:
## log_lik[34],The following variables have undefined values: log_lik[35],The
## following variables have undefined values: log_lik[36],The following
## variables have undefined values: log_lik[37],The following variables have
## undefined values: log_lik[38],The following variables have undefined values:
## log_lik[39],The following variables have undefined values: log_lik[40],The
## following variables have undefined values: log_lik[41],The following
## variables have undefined values: log_lik[42],The following variables have
## undefined values: log_lik[43],The following variables have undefined values:
## log_lik[44],The following variables have undefined values: log_lik[45],The
## following variables have undefined values: log_lik[46],The following
## variables have undefined values: log_lik[47],The following variables have
## undefined values: log_lik[48],The following variables have undefined values:
## log_lik[49],The following variables have undefined values: log_lik[50],The
## following variables have undefined values: log_lik[51],The following
## variables have undefined values: log_lik[52],The following variables have
## undefined values: log_lik[53],The following variables have undefined values:
## log_lik[54],The following variables have undefined values: log_lik[55],The
## following variables have undefined values: log_lik[56],The following
## variables have undefined values: log_lik[57],The following variables have
## undefined values: log_lik[58],The following variables have undefined values:
## log_lik[59],The following variables have undefined values: log_lik[60],The
## following variables have undefined values: log_lik[61],The following
## variables have undefined values: log_lik[62],The following variables have
## undefined values: log_lik[63],The following variables have undefined values:
## log_lik[64],The following variables have undefined values: log_lik[65],The
## following variables have undefined values: log_lik[66],The following
## variables have undefined values: log_lik[67],The following variables have
## undefined values: log_lik[68],The following variables have undefined values:
## log_lik[69],The following variables have undefined values: log_lik[70],The
## following variables have undefined values: log_lik[71],The following
## variables have undefined values: log_lik[72],The following variables have
## undefined values: log_lik[73],The following variables have undefined values:
## log_lik[74],The following variables have undefined values: log_lik[75],The
## following variables have undefined values: log_lik[76],The following
## variables have undefined values: log_lik[77],The following variables have
## undefined values: log_lik[78],The following variables have undefined values:
## log_lik[79],The following variables have undefined values: log_lik[80],The
## following variables have undefined values: log_lik[81],The following
## variables have undefined values: log_lik[82],The following variables have
## undefined values: log_lik[83],The following variables have undefined values:
## log_lik[84],The following variables have undefined values: log_lik[85],The
## following variables have undefined values: log_lik[86],The following
## variables have undefined values: log_lik[87],The following variables have
## undefined values: log_lik[88],The following variables have undefined values:
## log_lik[89],The following variables have undefined values: log_lik[90],The
## following variables have undefined values: log_lik[91],The following
## variables have undefined values: log_lik[92],The following variables have
## undefined values: log_lik[93],The following variables have undefined values:
## log_lik[94],The following variables have undefined values: log_lik[95],The
## following variables have undefined values: log_lik[96],The following
## variables have undefined values: log_lik[97],The following variables have
## undefined values: log_lik[98],The following variables have undefined values:
## log_lik[99],The following variables have undefined values: log_lik[100],The
## following variables have undefined values: log_lik[101],The following
## variables have undefined values: log_lik[102],The following variables have
## undefined values: log_lik[103],The following variables have undefined values:
## log_lik[104],The following variables have undefined values: log_lik[105],The
## following variables have undefined values: log_lik[106],The following
## variables have undefined values: log_lik[107],The following variables have
## undefined values: log_lik[108],The following variables have undefined values:
## log_lik[109],The following variables have undefined values: log_lik[110],The
## following variables have undefined values: log_lik[111],The following
## variables have undefined values: log_lik[112],The following variables have
## undefined values: log_lik[113],The following variables have undefined values:
## log_lik[114],The following variables have undefined values: log_lik[115],The
## following variables have undefined values: log_lik[116],The following
## variables have undefined values: log_lik[117],The following variables have
## undefined values: log_lik[118],The following variables have undefined values:
## log_lik[119],The following variables have undefined values: log_lik[120],The
## following variables have undefined values: log_lik[121],The following
## variables have undefined values: log_lik[122],The following variables have
## undefined values: log_lik[123],The following variables have undefined values:
## log_lik[124],The following variables have undefined values: log_lik[125],The
## following variables have undefined values: log_lik[126],The following
## variables have undefined values: log_lik[127],The following variables have
## undefined values: log_lik[128],The following variables have undefined values:
## log_lik[129],The following variables have undefined values: log_lik[130],The
## following variables have undefined values: log_lik[131],The following
## variables have undefined values: log_lik[132],The following variables have
## undefined values: log_lik[133],The following variables have undefined values:
## log_lik[134],The following variables have undefined values: log_lik[135],The
## following variables have undefined values: log_lik[136],Th
t_m3 <- Sys.time()  # timing checkpoint: end of tract-3Q BWQS fit

# Collect fit statistics for one fitted BWQS model into a one-row tibble:
# WAIC, Bayesian R2, RMSE of the posterior-mean predictions against the
# observed rates in `df`, and the exponentiated posterior-mean beta1
# (the mixture effect estimate). The model's name, as written by the
# caller, becomes the `model_name` column.
Summarize_BWQS_performance <- function(model, df){
  label <- deparse(substitute(model))
  # posterior-mean prediction for each observation
  y_hat <- colMeans(extract(model, "y_new")$y_new)
  tibble(
    model_name = label,
    waic = extract_waic(model)$waic[[1]],
    bayesr2 = Compute_Bayes_R2(model)$meanr2,
    rmse = sqrt(mean((df$pos_per_100000 - y_hat)^2)),
    effect_estimate = exp(mean(extract(model, "beta1")$beta1))
  )
}

# Compare fit across the three BWQS specifications (ZCTA-level m1,
# tract-median m2, tract-3rd-quartile m2); printed, not assigned.
# FIX: removed the trailing empty argument in the original call, which only
# parsed because rlang tolerates trailing commas in dots.
bind_rows(Summarize_BWQS_performance(m1, SES_zcta_median_testing),
          Summarize_BWQS_performance(m2_median, SES_zcta_median_testing),
          Summarize_BWQS_performance(m2_3q, SES_zcta_3q_testing))
## # A tibble: 3 x 5
##   model_name  waic bayesr2  rmse effect_estimate
##   <chr>      <dbl>   <dbl> <dbl>           <dbl>
## 1 m1         2370.   0.934  184.            1.08
## 2 m2_median  2362.   0.936  185.            1.08
## 3 m2_3q      2359.   0.937  183.            1.08

Part 2: Compare capacity to socially distance (as measured by transit data) by neighborhood-level risk scores

#### Part 2: Compare capacity to socially distance (as measured by transit data) by neighborhood-level risk scores ####  

# Crosswalk UHF neighborhood codes to ZCTAs
UHF_ZipCodes1 <- UHF_ZipCodes %>%
  mutate(zcta = as.character(zip),
         uhf = as.character(code)) %>%
  dplyr::select(zcta, uhf)

# Per-ZCTA BWQS scores without geometry
ZCTA_BWQS_score <- ZCTA_BWQS_COVID_shp %>%
  st_drop_geometry() %>%
  dplyr::select(zcta, BWQS_index)

# Population-weighted mean BWQS index per UHF neighborhood.
# BUG FIX: the second join passed `join = "zcta"` -- not a left_join()
# argument, so it was silently ignored and dplyr fell back to a natural
# join (hence the 'Joining, by = "zcta"' message); use `by = "zcta"`.
UHF_BWQS <- ZCTA_ACS_COVID %>%
  left_join(UHF_ZipCodes1, by = "zcta") %>%
  left_join(ZCTA_BWQS_score, by = "zcta") %>%
  group_by(uhf) %>%
  summarise(uhf_weighted_bwqs = weighted.mean(BWQS_index, total_pop1)) # population weighting

# Median split of neighborhoods into High/Low BWQS risk
UHF_BWQS_COVID_shp <- UHF_shp %>%
  mutate(uhf = as.character(UHFCODE)) %>%
  left_join(UHF_BWQS, by = "uhf") %>%
  mutate(Risk = factor(ifelse(uhf_weighted_bwqs > median(uhf_weighted_bwqs, na.rm = TRUE),
                              "High", "Low"),
                       levels = c("High", "Low")))

# Quick visual check of the neighborhood-level weighted BWQS index
ggplot(UHF_BWQS_COVID_shp, aes(fill = uhf_weighted_bwqs)) +
  geom_sf()

# Exploratory plot: city-wide ("all") relative subway ridership over the
# late-Jan through April 2020 window used for the dose-response fit
Subway_ridership_by_UHF %>% filter(place=="all" & date >= "2020-01-29" & date <= "2020-04-30") %>% 
           ggplot() + geom_point(aes(date, usage.median.ratio, color = place))

Mean_Ridership <- Subway_ridership_by_UHF %>% #Use the mean ridership to identify the proper function for a nonlinear model
  filter(date >= "2020-01-29" & date <= "2020-04-30" & place=="all") %>%   
  arrange(date) %>%
  # NOTE(review): time() is intended for ts objects; here it appears to act
  # as a 1..n index over the date-sorted rows for the drm() fit below --
  # confirm seq_along(date) would be equivalent before refactoring
  mutate(time_index = time(date))

# same series without the color mapping, as fit input sanity check
Mean_Ridership %>% 
  ggplot() + geom_point(aes(date, usage.median.ratio))

#The weibull probability distribution works best for these data
fit_drm_w2.4 <- drm(usage.median.ratio ~ time_index, fct =  W2.4(), data = Mean_Ridership)

# suppress warning about vector recycling in predict.drc.R
# Calling handler that muffles ONLY the known harmless recycling warning
# emitted by predict.drc(); every other warning propagates unchanged.
# Improvements: match against conditionMessage(w) (the message string) rather
# than the condition object itself, and use fixed = TRUE since the message
# contains regex metacharacters ('.').
handler <- function(w) {
  if (any(grepl("Recycling array of length 1 in array-vector arithmetic is deprecated.",
                conditionMessage(w), fixed = TRUE))) {
    invokeRestart("muffleWarning")
  }
}

# Observed citywide ridership with fitted values and confidence band attached;
# `handler` muffles the known harmless recycling warning from predict.drc().
DRM_mean_predictions <- bind_cols(Mean_Ridership,
                                  as_tibble(withCallingHandlers(predict(fit_drm_w2.4, interval = "confidence"), warning = handler ))) 

# Supplementary Figure 6: observed citywide ridership with the fitted Weibull
# curve (red) and its confidence band.
# Improvement: all layers now map columns of DRM_mean_predictions (which
# contains the Mean_Ridership columns via bind_cols above) instead of
# reaching into Mean_Ridership$... / DRM_mean_predictions$... inside aes(),
# which bypasses ggplot2's data masking and is fragile. The rendered plot is
# identical.
sfig6 <- ggplot(DRM_mean_predictions, aes(x = date)) +
  geom_point(aes(y = usage.median.ratio)) +
  geom_ribbon(aes(ymin = Lower, ymax = Upper), fill = "grey50", alpha = .5) +
  geom_line(aes(y = Prediction), color = "red") +
  theme_bw(base_size = 16) +
  xlab("Date") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent)
if(export.figs) ggsave(sfig6, filename = file.path(fig.path, paste0("sfig6", "_", Sys.Date(), ".png")), 
                       device = "png", dpi = 300, width = 8, height = 5)

# create a dataframe for the analysis 
# Dates of known weekend service changes in two low-subway-density UHF areas
# (403, 502); used with anti_join() below to drop the resulting low outliers.
# NOTE(review): the final pair for place 502 repeats "2020-02-16" twice; by
# analogy with the weekend pairs above it was probably meant to be
# "2020-02-15", "2020-02-16" -- confirm against the service-change records.
service_changes_in_lowsubway_areas <- tibble(date = as.Date(c("2020-02-01", "2020-02-02", "2020-02-08", "2020-02-09", "2020-02-15", "2020-02-16", "2020-02-22", "2020-02-23", "2020-02-29", "2020-03-01", "2020-03-07", "2020-03-08", 
                                                      "2020-02-01", "2020-02-02", "2020-02-08", "2020-02-09", "2020-02-16", "2020-02-16")),
                                             place = c("403","403","403","403","403","403","403","403","403","403","403","403", 
                                                       "502","502","502","502","502","502"))

# Per-UHF daily relative ridership joined with the median-split Risk group;
# restricted to the study window, minus the Presidents' Day holiday and the
# known service-change outliers.
Subway_BWQS_df <- Subway_ridership_by_UHF %>%
  left_join(., st_set_geometry(UHF_BWQS_COVID_shp, NULL), by = c("place" = "uhf")) %>%
  filter(!is.na(place) & date >= "2020-01-29" & date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>%
  filter(!is.na(Risk) & date != "2020-02-17") %>% # removing Presidents' Day national holiday 
  anti_join(., service_changes_in_lowsubway_areas, by = c("date", "place")) %>% # removing low outliers due to service changes in low subway density areas
  ungroup()
  
# Pooled (single-curve) Weibull fit across all UHF areas -- the null model
# for the ANOVA comparison below
fit_drm_all <- drm(usage.median.ratio ~ time_index, fct = W2.4(), data = Subway_BWQS_df)
plot(fit_drm_all)

# Separate curves per Risk group (High/Low)
fit_drm_interact <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, separate = T) #using this for inference
fit_drm_interact1 <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, separate = F) #using this for plotting -- can't get separate = T to plot correctly!
fit_drm_risk_high <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, subset = Risk=="High")
fit_drm_risk_low <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, subset = Risk=="Low")
anova(fit_drm_all, fit_drm_interact) # comparing the mean only model to the interaction model 
## 
## 1st model
##  fct:      W2.4()
## 2nd model
##  fct:      NULL
## ANOVA table
## 
##           ModelDf     RSS Df F value p value
## 1st model    3291 21.3020                   
## 2nd model    3287  9.7062  4  981.73    0.00
summary(fit_drm_interact)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##           Estimate  Std. Error t-value   p-value    
## b:Low  -11.3031622   0.3539527 -31.934 < 2.2e-16 ***
## c:Low    0.0948445   0.0032081  29.564 < 2.2e-16 ***
## d:Low    0.9771921   0.0028358 344.594 < 2.2e-16 ***
## e:Low   44.8234259   0.0974926 459.762 < 2.2e-16 ***
## b:High -10.9375092   0.3344433 -32.704 < 2.2e-16 ***
## c:High   0.1600641   0.0033505  47.773 < 2.2e-16 ***
## d:High   0.9643656   0.0027745 347.587 < 2.2e-16 ***
## e:High  46.5688477   0.1056759 440.676 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.07299479 (3287 degrees of freedom)
confint(fit_drm_interact)
##               2.5 %      97.5 %
## b:Low  -11.99715225 -10.6091722
## c:Low    0.08855447   0.1011346
## d:Low    0.97163205   0.9827522
## e:Low   44.63227355  45.0145782
## b:High -11.59324749 -10.2817710
## c:High   0.15349477   0.1666334
## d:High   0.95892577   0.9698054
## e:High  46.36165053  46.7760449
# Group differences in individual Weibull parameters (b = steepness-related
# scale factor, c = lower asymptote)
compParm(fit_drm_interact, "b", "-")
## 
## Comparison of parameter 'b' 
## 
##          Estimate Std. Error t-value p-value
## Low-High -0.36565    0.48696 -0.7509  0.4528
compParm(fit_drm_interact, "c", "-")
## 
## Comparison of parameter 'c' 
## 
##            Estimate Std. Error t-value   p-value    
## Low-High -0.0652196  0.0046387  -14.06 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# b is a scale factor, not a slope; the authors' transformation -b*(-d/(4*e))
# converts it to the slope of the decline (see the same note in the 3-split
# sensitivity analysis below).
slopes <- as_tibble(coef(fit_drm_interact), rownames = "vars") %>%
  separate(col = vars, into = c("vars", "BWQS_risk"), sep = ":") %>%
  spread(., vars, value) %>%
  mutate(slope = -b*(-d/(4*e)))

# Slope CIs from the CI on b, holding d and e at their point estimates.
# NOTE(review): this ignores uncertainty in d and e (no delta method) --
# confirm that is the intended approximation.
as_tibble(confint(fit_drm_interact), rownames = "vars") %>%
  separate(col = vars, into = c("vars", "BWQS_risk"), sep = ":") %>%
  filter(vars == "b") %>%
  right_join(., slopes, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %`*(-d/(4*e)),
  slope_higher_ci = -`97.5 %`*(-d/(4*e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 2 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 Low       -0.0616        -0.0654         -0.0578
## 2 High      -0.0566        -0.0600         -0.0532
# Fitted values + confidence band from the separate = F model (plotting model)
fit_drm_predictions <- as_tibble(withCallingHandlers(predict(fit_drm_interact1, as.data.frame(Subway_BWQS_df %>% dplyr::select(usage.median.ratio, time_index, Risk)), 
                                                             interval = "confidence"), warning = handler))
Subway_BWQS_df1 <- bind_cols(Subway_BWQS_df, fit_drm_predictions) 

Subway_BWQS_df2 <- Subway_BWQS_df1 %>%
  filter(date>"2020-02-16") %>% # subsetting for visualization
  mutate(Risk = if_else(Risk == "High", "High (above median)", "Low (below median)"))

# Figure 5: relative subway ridership over time by inequity-index group.
# Jittered points are daily UHF observations; grey ribbons are the drc
# confidence bands for each group; the dashed vertical line marks 2020-03-22.
fig5 <- ggplot() +
  geom_jitter(data = Subway_BWQS_df2,
              aes(x = date, y = usage.median.ratio, color = Risk),
              alpha = .5,
              position = position_jitter(height = 0, width = 0.4)) +
  geom_ribbon(data = filter(Subway_BWQS_df2, Risk == "High (above median)"),
              aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = filter(Subway_BWQS_df2, Risk == "Low (below median)"),
              aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_line(data = Subway_BWQS_df2,
            aes(x = date, y = Prediction, color = Risk)) +
  scale_x_date("Date", date_minor_breaks = "1 week") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) +
  geom_vline(xintercept = date("2020-03-22"), color = "grey30", lty = 2) +
  theme_bw(base_size = 16) +
  labs(colour = "COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12),
        legend.position = c(0.8, 0.7))
if(export.figs) ggsave(fig5, filename = file.path(fig.path, paste0("fig5", "_", Sys.Date() ,".png")), 
                       dpi = 300, width = 8, height = 6)

# which UHF neighborhoods were dropped?
# UHF areas that appear in the ridership analysis...
included_uhf <- distinct(Subway_BWQS_df, UHFCODE)
# ...and those that do not (excluding the placeholder code 0); these get a
# "*" label drawn on the map in Supplementary Figure 5 below.
notincluded_uhf_shp <- UHF_BWQS_COVID_shp %>%
  filter(!UHFCODE %in% included_uhf$UHFCODE,
         UHFCODE != 0) %>%
  mutate(NotIncluded = "*")

# Supplementary Figure 5
# Map of the High/Low inequity-index grouping by UHF area, with subway
# stations overlaid; UHF areas excluded from the ridership analysis are
# marked with "*" (see notincluded_uhf_shp above).
sfig5 <- ggplot() + 
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = NYC_basemap_shp, size = 0.2) +
  geom_sf(data = subset(UHF_BWQS_COVID_shp, !is.na(Risk)), aes(fill = Risk), size = 0.2) + 
  labs(fill = "COVID-19 inequity index\ngrouping") + 
  geom_sf(data = SubwayStation_shp, size = 0.6) +
  geom_sf_text(data = notincluded_uhf_shp, aes(label = NotIncluded), size = 9, color = "grey15") +
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  theme_bw() + 
  theme_smallmaps + 
  theme(legend.position = c(0.22, 0.86), plot.margin = margin(0, 0, 0, 0, "pt"))
if(export.figs) ggsave(sfig5, filename = file.path(fig.path, paste0("sfig5", "_", Sys.Date(),".png")), 
                       dpi = 300, width = 4, height = 4)

Part 3: Spatial analysis of mortality in relation to BWQS scores

#### Part 3: Spatial analysis of mortality in relation to BWQS scores  ####

# Step 1: Create dataframes with the relevant information 
# One row per ZCTA with the cumulative COVID-19 death count and rate
# (city mortality data are reported by modified ZCTA, hence the crosswalk).
deaths_by23May2020_by_zcta1 <- deaths_by23May2020_by_zcta %>%
  left_join(modzcta_to_zcta, by = c("MODIFIED_ZCTA" = "MODZCTA")) %>%
  mutate(zcta = as.character(MODIFIED_ZCTA)) %>%
  rename(deaths_count = COVID_DEATH_COUNT) %>%
  dplyr::select(zcta, deaths_count, COVID_DEATH_RATE) %>%
  distinct(zcta, .keep_all = TRUE)

# ZCTA shapes with BWQS index, mortality, and proportion aged 65+ attached
ZCTA_BWQS_COVID_shp1 <- ZCTA_ACS_COVID_shp %>%
  left_join(ZCTA_BWQS_score, by = "zcta") %>%
  left_join(deaths_by23May2020_by_zcta1, by = "zcta") %>%
  mutate(prop_65plus = age65_plus / total_pop1,
         zcta = as.numeric(zcta))

# Figure 3C - Mortality
# Choropleth of cumulative COVID mortality per 100,000 by zip code
fig3c <- ZCTA_BWQS_COVID_shp1 %>% 
  ggplot() +
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(aes(fill = COVID_DEATH_RATE), lwd = 0.2)+
  scalebar(MODZCTA_NYC_shp1, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5,
           anchor = c(x = -73.71, y = 40.51)) + 
  labs(fill = "Mortality per 100,000") +
  ggtitle("Cumulative COVID mortality by zip code (May 23, 2020)") +
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  theme_bw(base_size = 6) + 
  theme_smallmaps


# Combine subfigures to make Figure 3: maps of BWQS, Tests, and Mortality by ZCTA
# (fig3a and fig3b are defined earlier in the document)
fig3a_2 = fig3a + labs(tag = "a") + theme(plot.tag.position = c(-0.018, 0.985), plot.tag = element_text(face = "bold", size = 15))
fig3b_2 = fig3b + labs(tag = "b") + theme(plot.tag.position = c(-0.028, 0.93), plot.tag = element_text(face = "bold", size = 15))
fig3c_2 = fig3c + labs(tag = "c") + theme(plot.tag.position = c(-0.028, 0.93), plot.tag = element_text(face = "bold", size = 15))

# Render the combined panel with ragg for consistent output across platforms
plotres = 300
agg_png(filename = file.path(fig.path, paste0("fig3_combined_", Sys.Date(), ".png")), 
        width = plotres*4.25, height = plotres*6, scaling = 2)
grid.arrange(arrangeGrob(fig3a_2, ncol = 1, nrow = 1), 
             arrangeGrob(fig3b_2, fig3c_2, ncol = 2, nrow = 1),
             nrow = 2, ncol = 1, heights = c(1.9,1))
dev.off()
## png 
##   2

# Step 2: Run negative binomial model with spatial filtering  

# Step 2a: Identify the neighbor relationships 
# Symmetric 6-nearest-neighbor weights (row-standardized) on ZCTA centroids
spdat.sens <- as_Spatial(ZCTA_BWQS_COVID_shp1)
ny.nb6 <- knearneigh(sp::coordinates(spdat.sens), k=6)
ny.nb6 <- knn2nb(ny.nb6)
ny.nb6 <- make.sym.nb(ny.nb6)
ny.wt6 <- nb2listw(ny.nb6, style="W")

# Step 2b: Fit the model to identify the component of the data with substantial spatial autocorrelation
# Deaths ~ BWQS with a population offset; Moran test shows residual spatial
# autocorrelation, motivating the eigenvector filtering below
fit.nb.ny.sens<-glm.nb(deaths_count~offset(log(total_pop1))+BWQS_index, spdat.sens)
lm.morantest(fit.nb.ny.sens, listw = ny.wt6)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = deaths_count ~ offset(log(total_pop1)) +
## BWQS_index, data = spdat.sens, init.theta = 6.682813998, link = log)
## weights: ny.wt6
## 
## Moran I statistic standard deviate = 3.1743, p-value = 0.000751
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.124763298     -0.001792051      0.001589528
# Moran eigenvector filtering selects spatial eigenvectors (here 16 and 23)
# that absorb the residual autocorrelation
me.fit.sens <- spatialreg::ME(deaths_count~offset(log(total_pop1))+BWQS_index,
                              spdat.sens@data, family=negative.binomial(fit.nb.ny.sens$theta), listw = ny.wt6, verbose=T, alpha=.1, nsim = 999)
## eV[,16], I: 0.05665305 ZI: NA, pr(ZI): 0.063
## eV[,23], I: 0.01207501 ZI: NA, pr(ZI): 0.342
# Step 2c: Pull out these fits and visualize the autocorrelation
fits.sens <- data.frame(fitted(me.fit.sens))
spdat.sens$me16 <- fits.sens$vec16
spdat.sens$me23 <- fits.sens$vec23
spplot(spdat.sens, "me16", at=quantile(spdat.sens$me16, p=seq(0,1,length.out = 7)))

spplot(spdat.sens, "me23", at=quantile(spdat.sens$me23, p=seq(0,1,length.out = 7)))

# Step 2d: Include the fits in our regression model as an additional parameter
# Refit deaths ~ BWQS with the selected Moran eigenvectors as covariates;
# the BWQS effect (exp(estimate) ~ 1.20 per unit index) is the headline result
clean.nb.sens<-glm.nb(deaths_count~offset(log(total_pop1))+BWQS_index+fitted(me.fit.sens), spdat.sens@data)
tidy(clean.nb.sens) %>% mutate(estimate_exp = exp(estimate))
## # A tibble: 4 x 6
##   term                     estimate std.error statistic  p.value estimate_exp
##   <chr>                       <dbl>     <dbl>     <dbl>    <dbl>        <dbl>
## 1 (Intercept)                -7.29     0.0839    -86.9  0.           0.000679
## 2 BWQS_index                  0.181    0.0151     12.0  3.73e-33     1.20    
## 3 fitted(me.fit.sens)vec16   -0.925    0.383      -2.41 1.58e- 2     0.397   
## 4 fitted(me.fit.sens)vec23   -1.78     0.381      -4.66 3.16e- 6     0.169
as_tibble(confint(clean.nb.sens), rownames = "vars")%>% mutate_at(vars(2:3), .funs = list(~exp(.)))
## Waiting for profiling to be done...
## # A tibble: 4 x 3
##   vars                      `2.5 %` `97.5 %`
##   <chr>                       <dbl>    <dbl>
## 1 (Intercept)              0.000575 0.000804
## 2 BWQS_index               1.16     1.23    
## 3 fitted(me.fit.sens)vec16 0.188    0.836   
## 4 fitted(me.fit.sens)vec23 0.0789   0.361
# Residual autocorrelation is now borderline non-significant
lm.morantest(clean.nb.sens, resfun = residuals, listw=ny.wt6)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = deaths_count ~ offset(log(total_pop1)) +
## BWQS_index + fitted(me.fit.sens), data = spdat.sens@data, init.theta =
## 8.00423828, link = log)
## weights: ny.wt6
## 
## Moran I statistic standard deviate = 1.643, p-value = 0.05019
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.063550312     -0.002549371      0.001618464
# NOTE(review): for a glm, $residuals are the working residuals, not deviance
# residuals -- confirm this is the intended scale for the residual map below.
spdat.sens$nb_residuals <- clean.nb.sens$residuals
spplot(spdat.sens, "nb_residuals", at=quantile(spdat.sens$nb_residuals, p=seq(0,1,length.out = 10)))

# Supplementary Figure 10: map of the spatially-filtered model residuals
pal_sfig10 <- brewer_pal(type = "div", palette = "RdYlBu")(5)[2:5] 
pal_sfig10[[2]] <- "#fafadf" # make yellow less saturated
sfig10 <- ggplot() + 
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = st_as_sf(spdat.sens), aes(fill = cut_width(nb_residuals, width = 1)), lwd = 0.2) + 
  scale_fill_discrete(type = pal_sfig10) + 
  coord_sf(crs = st_crs(MODZCTA_NYC_shp1),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) + 
  labs(fill = "Residuals\n(Standard Deviations)") + 
  theme(panel.background = element_rect(fill = "#cccccc"), 
        panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.title = element_text(face = "bold", size = 9),
        legend.text = element_text(size = 8.5),
        plot.margin = unit(c(0.1,0.1,0.1,0.1), "in")
        )
if(export.figs) ggsave(sfig10, filename = file.path(fig.path, paste0("sfig10", "_", Sys.Date(),".png")), 
                       dpi = 300, width = 5, height = 3.2)

# Sensitivity Analyses

#### Sensitivity Analyses ####

# half-Cauchy prior for overdispersion parameter
# Sensitivity check: refit the main BWQS model with a half-Cauchy prior on
# the overdispersion parameter and compare the beta1 effect estimate
BWQS_NB_halfcauchy_stan_model <- here("code", "nb_bwqs_halfcauchy.stan") 
m1_halfcauchy <- fit_BWQS_model(m1data$data_list, BWQS_NB_halfcauchy_stan_model)
# beta1 effect estimate using half-cauchy prior for overdispersion
exp(summary(m1_halfcauchy)$summary[3,c(1,4,8)])
##     mean     2.5%    97.5% 
## 1.077273 1.066034 1.089665
# main model (inverse gamma prior for overdispersion)
exp(summary(m1)$summary[3,c(1,4,8)])
##     mean     2.5%    97.5% 
## 1.076975 1.064520 1.089281
# no meaningful difference in effect estimate or credible interval!

# BWQS weights -- stability of the rankings 
# For each posterior draw, find which of the 10 weights is largest, then
# tabulate how often each weight "wins" across draws.
# Improvement: the percentage denominator is derived from the data
# (sum of the per-weight counts == total number of posterior draws) instead
# of the hard-coded 1750, so this stays correct if sampler settings change.
# With the current 1750 draws the output below is unchanged.
as_tibble(extract(m1, c("W[1]","W[2]", "W[3]", "W[4]", "W[5]", "W[6]", "W[7]", "W[8]", "W[9]", "W[10]"))) %>%
  rownames_to_column("iteration") %>%
  gather(weight, value, "W[1]":"W[10]") %>%
  group_by(iteration) %>%
  slice(which.max(value)) %>%   # keep only the largest weight in each draw
  ungroup() %>%
    group_by(weight) %>%
    dplyr::summarise(times_largest_weight = n()) %>%
    mutate(pct_largest_weight = round((times_largest_weight/sum(times_largest_weight))*100, 2))
## # A tibble: 8 x 3
##   weight times_largest_weight pct_largest_weight
##   <chr>                 <int>              <dbl>
## 1 W[1]                     94               5.37
## 2 W[10]                   457              26.1 
## 3 W[2]                    564              32.2 
## 4 W[4]                      4               0.23
## 5 W[5]                     13               0.74
## 6 W[7]                    504              28.8 
## 7 W[8]                     35               2   
## 8 W[9]                     79               4.51
# using essential workers, testing ratio, and median income
# Benchmark model 1: conventional NB regression with three a-priori covariates
glm_esstl_and_income <- glm.nb(pos_per_100000 ~ not_quarantined_jobs + medincome + testing_ratio, data = ZCTA_ACS_COVID)
broom::tidy(glm_esstl_and_income)
## # A tibble: 4 x 5
##   term                    estimate   std.error statistic   p.value
##   <chr>                      <dbl>       <dbl>     <dbl>     <dbl>
## 1 (Intercept)           6.34       0.0701          90.4  0.       
## 2 not_quarantined_jobs  0.0185     0.00248          7.46 8.96e- 14
## 3 medincome            -0.00000250 0.000000359     -6.96 3.36e- 12
## 4 testing_ratio        19.5        0.888           22.0  4.07e-107
# Predicted case rates per ZCTA (exponentiated from the log link)
enframe(predict(glm_esstl_and_income)) %>% mutate(value = exp(value))
## # A tibble: 177 x 2
##    name  value
##    <chr> <dbl>
##  1 1     1293.
##  2 2     1301.
##  3 3      877.
##  4 4      870.
##  5 5      710.
##  6 6      787.
##  7 7      677.
##  8 8     1200.
##  9 9      987.
## 10 10     891.
## # … with 167 more rows
# Just using proportion uninsured and median income
# Benchmark model 2: insurance status loses significance once income is in
glm_insurance_and_income <- glm.nb(pos_per_100000 ~ not_insured + medincome, data = ZCTA_ACS_COVID)
broom::tidy(glm_insurance_and_income)
## # A tibble: 3 x 5
##   term           estimate   std.error statistic  p.value
##   <chr>             <dbl>       <dbl>     <dbl>    <dbl>
## 1 (Intercept)  8.14       0.120          68.0   0.      
## 2 not_insured -0.00420    0.00857        -0.490 6.24e- 1
## 3 medincome   -0.00000737 0.000000891    -8.26  1.41e-16
# PCA of social variables 
# First principal component of the standardized SES variables, used as a
# conventional comparison index against the BWQS.
pca_socialvars <- ZCTA_ACS_COVID %>%
  dplyr::select(all_of(SES_vars)) %>%
  prcomp(center = TRUE, scale. = TRUE)
# PC1 loadings keyed by variable name, largest loading first
PC1 <- enframe(pca_socialvars$rotation[, 1]) %>%
  rename(label = name) %>%
  arrange(desc(value))

# Attach each ZCTA's PC1 score for the regression below
df_for_PCA_analysis <- ZCTA_ACS_COVID %>% bind_cols(PC1 = pca_socialvars$x[, 1])

# Compare BWQS ranks to Principal Components ranks
# Side-by-side ranking of variable importance under the two approaches
# (BWQS_fits holds the posterior mean weights from the main model)
BWQS_fits %>%
  dplyr::select(mean, label) %>%
  mutate(rank_bwqs = rank(mean)) %>%
  left_join(., PC1, by = "label") %>%
  mutate(rank_pca = rank(value)) %>%
  dplyr::select(2, 3, 5, 1, 4)
## # A tibble: 10 x 5
##    label                     rank_bwqs rank_pca   mean  value
##    <chr>                         <dbl>    <dbl>  <dbl>  <dbl>
##  1 not_insured                      10        4 0.177  0.317 
##  2 essentialworker_drove             9        2 0.166  0.201 
##  3 avg_hhold_size                    8        9 0.161  0.382 
##  4 res_vol_popdensity                7        3 0.108  0.217 
##  5 one_over_medincome                6        8 0.105  0.363 
##  6 didnot_workhome_commute           5        7 0.0894 0.359 
##  7 not_quarantined_jobs              4       10 0.0618 0.397 
##  8 unemployed                        3        6 0.0547 0.358 
##  9 essentialworker_pubtrans          2        5 0.0498 0.335 
## 10 one_over_grocers_per_1000         1        1 0.0267 0.0891
# Benchmark model 3: NB regression on the PC1 score, with a natural spline
# on the testing ratio (mirroring the BWQS model's testing adjustment)
glm_princomp <- glm.nb(pos_per_100000 ~ PC1 + ns(testing_ratio, df = 3), data = df_for_PCA_analysis)
summary(glm_princomp)
## 
## Call:
## glm.nb(formula = pos_per_100000 ~ PC1 + ns(testing_ratio, df = 3), 
##     data = df_for_PCA_analysis, init.theta = 94.1393225, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5328  -0.5874   0.0735   0.5595   3.0695  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                6.635507   0.047025  141.10   <2e-16 ***
## PC1                        0.071785   0.004996   14.37   <2e-16 ***
## ns(testing_ratio, df = 3)1 0.980565   0.042292   23.19   <2e-16 ***
## ns(testing_ratio, df = 3)2 2.033267   0.106250   19.14   <2e-16 ***
## ns(testing_ratio, df = 3)3 1.135913   0.053253   21.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(94.1393) family taken to be 1)
## 
##     Null deviance: 2797.59  on 176  degrees of freedom
## Residual deviance:  179.89  on 172  degrees of freedom
## AIC: 2381
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  94.1 
##           Std. Err.:  10.7 
## 
##  2 x log-likelihood:  -2368.997
# Rate ratios (exponentiated profile-likelihood CIs) for the PCA model.
# Improvement: reuse the already-fitted glm_princomp instead of refitting an
# identical model inside the confint() call (same formula, same data, so the
# results are unchanged -- just no redundant fit).
exp(confint(glm_princomp))
## Waiting for profiling to be done...
##                                 2.5 %     97.5 %
## (Intercept)                693.994592 836.504296
## PC1                          1.063870   1.085062
## ns(testing_ratio, df = 3)1   2.456486   2.893115
## ns(testing_ratio, df = 3)2   6.178602   9.435865
## ns(testing_ratio, df = 3)3   2.805253   3.461891
# Predicted case rates from each benchmark model (exponentiated from log link)
enframe(predict(glm_princomp)) %>% mutate(glm_princomp = exp(value)) %>% dplyr::select(glm_princomp)
## # A tibble: 177 x 1
##    glm_princomp
##           <dbl>
##  1        1333.
##  2        1088.
##  3         727.
##  4         943.
##  5         741.
##  6         819.
##  7         891.
##  8        1065.
##  9         920.
## 10         809.
## # … with 167 more rows
enframe(predict(glm_esstl_and_income)) %>% mutate(glm_esstl_and_income = exp(value)) %>% dplyr::select(glm_esstl_and_income)
## # A tibble: 177 x 1
##    glm_esstl_and_income
##                   <dbl>
##  1                1293.
##  2                1301.
##  3                 877.
##  4                 870.
##  5                 710.
##  6                 787.
##  7                 677.
##  8                1200.
##  9                 987.
## 10                 891.
## # … with 167 more rows
# One table with observed rates plus predictions from all three approaches.
# NOTE(review): the column named "BWQS_index" here actually holds the
# posterior-mean predicted counts (y_new) from the BWQS model m1, not the
# index itself -- the name is misleading; confirm before reuse elsewhere.
Compare_Metrics <- bind_cols(ZCTA_ACS_COVID,
                             glm_princomp = enframe(predict(glm_princomp)) %>% mutate(glm_princomp = exp(value)) %>% dplyr::select(glm_princomp),
                             glm_esstl_and_income = enframe(predict(glm_esstl_and_income)) %>% mutate(glm_esstl_and_income = exp(value)) %>% dplyr::select(glm_esstl_and_income),
                             BWQS_index = colMeans(extract(m1,"y_new")$y_new))

# comparison of RMSE and Kendall's tau metrics for three different modeling approaches
# Supplemental Table 4
# BUG FIX: the RMSE lambda previously computed sqrt(mean(err)^2), which
# equals |mean(err)| (absolute mean bias), not root-mean-square error.
# The parentheses now square the per-observation errors BEFORE averaging:
# sqrt(mean(err^2)). The Kendall's tau row is unaffected; the RMSE values in
# the rendered output below were produced by the old formula and will change
# on re-run.
bind_rows(Compare_Metrics %>% 
            summarise_at(vars(c("BWQS_index", "glm_princomp", "glm_esstl_and_income")), 
                         ~cor(pos_per_100000, .x, method = "kendall")) %>%
            mutate(parameter = "kendalls"),
          Compare_Metrics %>% 
            summarise_at(vars(c("BWQS_index", "glm_princomp", "glm_esstl_and_income")), 
                         ~sqrt(mean((pos_per_100000-.x)^2))) %>%
            mutate(parameter = "RMSE"))
## # A tibble: 2 x 4
##   BWQS_index glm_princomp glm_esstl_and_income parameter
##        <dbl>        <dbl>                <dbl> <chr>    
## 1      0.869        0.850                0.823 kendalls 
## 2      2.79         2.13                 8.10  RMSE

Subway analyses

#### Subway analyses ####

# Different BWQS groupings for subway analysis (<.25, .25-.75, .75)
# Sensitivity analysis: tertile-style split of the weighted BWQS into
# Low (< 25th pctile) / Mid (IQR) / High (> 75th pctile) instead of the
# median split used in the main analysis.
UHF_BWQS_COVID_3split_shp <- UHF_shp %>% 
  mutate(uhf = as.character(UHFCODE)) %>%
  left_join(., UHF_BWQS, by = "uhf") %>%
  mutate(Risk = factor(ifelse(uhf_weighted_bwqs < quantile(uhf_weighted_bwqs, probs = .25, na.rm = T), "Low",
                              if_else(uhf_weighted_bwqs > quantile(uhf_weighted_bwqs, probs = .75, na.rm = T), "High", "Mid")),
                       levels = c("High", "Mid", "Low")))


# Same filtering pipeline as the main-analysis Subway_BWQS_df, but joined
# against the 3-way risk grouping
Subway_BWQS_3split_df <- Subway_ridership_by_UHF %>%
  left_join(., st_set_geometry(UHF_BWQS_COVID_3split_shp, NULL), by = c("place" = "uhf")) %>%
  filter(!is.na(place) & date >= "2020-01-29" & date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>%
  filter(!is.na(Risk) & date != "2020-02-17") %>% #removing Presidents' Day national holiday 
  anti_join(., service_changes_in_lowsubway_areas, by = c("date", "place")) #removing low outliers due to service changes in low subway density areas

# Weibull fits with a separate curve per 3-way risk group; compared against
# the pooled single-curve model via ANOVA
fit_drm_risk_3split <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, separate = F)
anova(fit_drm_all, fit_drm_risk_3split)
## 
## 1st model
##  fct:      W2.4()
##  pmodels: 1 (for all parameters)
## 2nd model
##  fct:      W2.4()
##  pmodels: Risk (for all parameters)
## ANOVA table
## 
##           ModelDf    RSS Df F value p value
## 1st model    3291 21.302                   
## 2nd model    3283 15.432  8   156.1     0.0
summary(fit_drm_risk_3split)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##           Estimate  Std. Error t-value   p-value    
## b:Low  -12.0668117   0.4809539 -25.089 < 2.2e-16 ***
## b:Mid  -11.0653059   0.3525339 -31.388 < 2.2e-16 ***
## b:High -10.8697531   0.3999555 -27.177 < 2.2e-16 ***
## c:Low    0.0728584   0.0038983  18.690 < 2.2e-16 ***
## c:Mid    0.1294325   0.0033939  38.137 < 2.2e-16 ***
## c:High   0.1751494   0.0040700  43.034 < 2.2e-16 ***
## d:Low    0.9805997   0.0035709 274.612 < 2.2e-16 ***
## d:Mid    0.9598153   0.0028993 331.053 < 2.2e-16 ***
##  [ reached getOption("max.print") -- omitted 4 rows ]
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.06856061 (3283 degrees of freedom)
# Pairwise group differences in parameters b (scale) and c (lower asymptote)
compParm(fit_drm_risk_3split, "b", operator = "-")
## 
## Comparison of parameter 'b' 
## 
##          Estimate Std. Error t-value p-value  
## Low-Mid  -1.00151    0.59632 -1.6795 0.09315 .
## Low-High -1.19706    0.62552 -1.9137 0.05575 .
## Mid-High -0.19555    0.53315 -0.3668 0.71380  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
compParm(fit_drm_risk_3split, "c", operator = "-")
## 
## Comparison of parameter 'c' 
## 
##            Estimate Std. Error  t-value   p-value    
## Low-Mid  -0.0565742  0.0051687 -10.9456 < 2.2e-16 ***
## Low-High -0.1022910  0.0056357 -18.1504 < 2.2e-16 ***
## Mid-High -0.0457168  0.0052994  -8.6268 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(fit_drm_risk_3split)

# Per-group fits (used below for group-specific predictions and slope CIs)
fit_drm_risk_3split_high <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="High")
fit_drm_risk_3split_mid <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="Mid")
fit_drm_risk_3split_low <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="Low")

summary(fit_drm_risk_3split_high)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -10.8693720   0.3278018 -33.158 < 2.2e-16 ***
## c:(Intercept)   0.1751430   0.0033359  52.502 < 2.2e-16 ***
## d:(Intercept)   0.9758825   0.0027014 361.256 < 2.2e-16 ***
## e:(Intercept)  46.8619040   0.1050027 446.292 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.05619323 (1008 degrees of freedom)
summary(fit_drm_risk_3split_mid)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.0658766   0.3964936 -27.909 < 2.2e-16 ***
## c:(Intercept)   0.1294362   0.0038167  33.913 < 2.2e-16 ***
## d:(Intercept)   0.9598145   0.0032605 294.376 < 2.2e-16 ***
## e:(Intercept)  45.9159522   0.1187447 386.678 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.07710264 (1359 degrees of freedom)
summary(fit_drm_risk_3split_low)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -12.0672287   0.4733213 -25.495 < 2.2e-16 ***
## c:(Intercept)   0.0728676   0.0038363  18.994 < 2.2e-16 ***
## d:(Intercept)   0.9805959   0.0035141 279.045 < 2.2e-16 ***
## e:(Intercept)  44.2637382   0.1136008 389.643 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.06747137 (916 degrees of freedom)
confint(fit_drm_risk_3split_high)
##                     2.5 %      97.5 %
## b:(Intercept) -11.5126240 -10.2261199
## c:(Intercept)   0.1685968   0.1816891
## d:(Intercept)   0.9705815   0.9811834
## e:(Intercept)  46.6558551  47.0679529
confint(fit_drm_risk_3split_mid)
##                     2.5 %      97.5 %
## b:(Intercept) -11.8436826 -10.2880707
## c:(Intercept)   0.1219489   0.1369234
## d:(Intercept)   0.9534184   0.9662107
## e:(Intercept)  45.6830095  46.1488949
confint(fit_drm_risk_3split_low)
##                      2.5 %       97.5 %
## b:(Intercept) -12.99614880 -11.13830857
## c:(Intercept)   0.06533873   0.08039652
## d:(Intercept)   0.97369930   0.98749260
## e:(Intercept)  44.04079005  44.48668636
# b is not actually a slope - but a scaling factor that needs to be transformed into the slope
slopes_3split <- as_tibble(coef(fit_drm_risk_3split_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  spread(., vars, value) %>%
  mutate(slope = -b*(-d/(4*e))) 

as_tibble(confint(fit_drm_risk_3split_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  filter(vars == "b") %>%
  right_join(., slopes_3split, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %`*(-d/(4*e)),
         slope_higher_ci = -`97.5 %`*(-d/(4*e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 3 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 high      -0.0566        -0.0599         -0.0532
## 2 mid       -0.0578        -0.0619         -0.0538
## 3 low       -0.0668        -0.0720         -0.0617
# Fitted curves with pointwise confidence bands for each tertile; `handler`
# intercepts the warnings emitted by drc's predict method for these fits.
predict_3split_ci <- function(fit, risk_label) {
  as_tibble(withCallingHandlers(predict(fit, interval = "confidence"),
                                warning = handler)) %>%
    bind_cols(Risk = risk_label)
}

fit_drm_predictions_3split <- bind_rows(
  predict_3split_ci(fit_drm_risk_3split_high, "High"),
  predict_3split_ci(fit_drm_risk_3split_mid, "Mid"),
  predict_3split_ci(fit_drm_risk_3split_low, "Low"))

# Attach fitted values to the observed data.
# NOTE(review): bind_cols matches rows purely by position, so this relies on
# arrange(Risk, date) reproducing the row order used when the models were fit
# -- confirm if the upstream data prep changes.
Subway_BWQS_3split_df1 <- bind_cols(
  Subway_BWQS_3split_df %>% arrange(Risk, date),
  fit_drm_predictions_3split %>% dplyr::select(-Risk))

# Relabel the tertiles for plotting and drop the early pre-outbreak period.
Subway_BWQS_3split_df2 <- Subway_BWQS_3split_df1 %>%
  filter(date > "2020-02-16") %>% # subsetting for visualization
  mutate(Risk = factor(case_when(Risk == "High" ~ "High (≥ 75%ile)",
                                 Risk == "Mid" ~ "Mid (IQR)",
                                 TRUE ~ "Low (≤ 25%ile)"),
                       levels = c("High (≥ 75%ile)", "Mid (IQR)", "Low (≤ 25%ile)")))

# Supplementary Figure 7: UHF-level relative ridership by inequity tertile,
# observed points (jittered) with fitted Weibull curves and confidence bands.
# Fixes: (1) the output filename previously contained a stray space
# ("sfig7 _<date>.png"); now "sfig7_<date>.png", matching sfig8.
# (2) dropped redundant as.factor() -- Risk is already a factor.
sfig7 <- ggplot() + 
  geom_jitter(data = Subway_BWQS_3split_df2,
              aes(x = date, y = usage.median.ratio, color = Risk),
              alpha = .5, position = position_jitter(height = 0, width = 0.4)) + 
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "High (≥ 75%ile)"),
              aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "Mid (IQR)"),
              aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "Low (≤ 25%ile)"),
              aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_line(data = Subway_BWQS_3split_df2,
            aes(x = date, y = Prediction, color = Risk)) +
  scale_x_date("Date", date_minor_breaks = "1 week") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) + 
  geom_vline(xintercept = date("2020-03-22"), color = "grey30", lty = 2) + # dashed line marks 2020-03-22 (NY PAUSE)
  theme_bw(base_size = 16) +
  labs(colour = "COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12),
        legend.position = c(0.8, 0.7))
if (export.figs) ggsave(sfig7,
                        filename = file.path(fig.path,
                                             paste0("sfig7", "_", Sys.Date(), ".png")),
                        dpi = 300, width = 8, height = 6)

# ZCTA level BWQS for subway 
Subway_ridership_by_ZCTA <- relative.subway.usage(2020, by = "zcta")

# There are clearly some higher outliers that need to be removed at the zcta level 
# Remove the zctas associated with the uhfs below
# NOTE(review): only ZCTA 10469 is listed here, on April 2020 weekend dates --
# confirm this matches the UHF-level outliers referenced above.
# (tibble recycles the length-1 `place` to match the 8 dates, replacing the
# previous 8x repeated literal.)
high_april_bronx_subway <- tibble(
  date = as.Date(c("2020-04-04", "2020-04-05", "2020-04-11", "2020-04-12",
                   "2020-04-18", "2020-04-19", "2020-04-25", "2020-04-26")),
  place = "10469")

# Join the BWQS inequity index onto ZCTA-level ridership, restrict to the
# study window, and build a within-place sequential time index for the fits.
Subway_BWQS_ZCTA_df <- Subway_ridership_by_ZCTA %>%
  left_join(ZCTA_BWQS_score, by = c("place" = "zcta")) %>%
  filter(!is.na(place), date >= "2020-01-29", date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>% # per-place index over its ordered dates
  ungroup() %>%
  filter(!is.na(BWQS_index), date != "2020-02-17") %>% # removing Presidents' Day national holiday
  anti_join(high_april_bronx_subway, by = c("date", "place")) %>% # removing flagged outlier days due to service changes in low subway density areas
  mutate(Risk = factor(
    case_when(BWQS_index < quantile(BWQS_index, probs = .25) ~ "Low",
              BWQS_index > quantile(BWQS_index, probs = .75) ~ "High",
              TRUE ~ "Mid"),
    levels = c("High", "Mid", "Low")))

# Fit a four-parameter Weibull (type 2) decline curve separately within each
# BWQS tertile of the ZCTA-level data.
fit_drm_risk_3split_zcta_high <- drm(
  usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(),
  data = Subway_BWQS_ZCTA_df, subset = Risk == "High")
fit_drm_risk_3split_zcta_mid <- drm(
  usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(),
  data = Subway_BWQS_ZCTA_df, subset = Risk == "Mid")
fit_drm_risk_3split_zcta_low <- drm(
  usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(),
  data = Subway_BWQS_ZCTA_df, subset = Risk == "Low")


# Parameter estimates for the high-inequity tertile fit (output recorded below).
summary(fit_drm_risk_3split_zcta_high)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -10.9137233   0.3404024 -32.061 < 2.2e-16 ***
## c:(Intercept)   0.1686448   0.0034263  49.221 < 2.2e-16 ***
## d:(Intercept)   0.9566853   0.0027549 347.266 < 2.2e-16 ***
## e:(Intercept)  46.9474325   0.1081606 434.053 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.09486494 (2748 degrees of freedom)
# Parameter estimates for the mid-inequity tertile fit (output recorded below).
summary(fit_drm_risk_3split_zcta_mid)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.3113739   0.3758877 -30.092 < 2.2e-16 ***
## c:(Intercept)   0.1340174   0.0035621  37.623 < 2.2e-16 ***
## d:(Intercept)   0.9817241   0.0030165 325.456 < 2.2e-16 ***
## e:(Intercept)  45.9104803   0.1072824 427.940 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.1474759 (5700 degrees of freedom)
# Parameter estimates for the low-inequity tertile fit (output recorded below).
summary(fit_drm_risk_3split_zcta_low)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.8046717   0.3455007 -34.167 < 2.2e-16 ***
## c:(Intercept)   0.0619258   0.0028874  21.447 < 2.2e-16 ***
## d:(Intercept)   0.9886886   0.0026770 369.327 < 2.2e-16 ***
## e:(Intercept)  43.9322297   0.0871923 503.854 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.08812689 (2756 degrees of freedom)
# The Weibull "b" coefficient is a shape/scaling parameter, not a slope; the
# slope at the curve's inflection point is recovered as -b * (-d / (4 * e)).
coefs_zcta_high <- as_tibble(coef(fit_drm_risk_3split_zcta_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high")
coefs_zcta_mid <- as_tibble(coef(fit_drm_risk_3split_zcta_mid), rownames = "vars") %>%
  bind_cols(BWQS_risk = "mid")
coefs_zcta_low <- as_tibble(coef(fit_drm_risk_3split_zcta_low), rownames = "vars") %>%
  bind_cols(BWQS_risk = "low")

slopes_zcta <- bind_rows(coefs_zcta_high, coefs_zcta_mid, coefs_zcta_low) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  spread(vars, value) %>%
  mutate(slope = -b * (-d / (4 * e)))

# Confidence limits for b, carried through the same transformation as the
# point estimates, printed as a three-row table (one row per tertile).
ci_zcta_high <- as_tibble(confint(fit_drm_risk_3split_zcta_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high")
ci_zcta_mid <- as_tibble(confint(fit_drm_risk_3split_zcta_mid), rownames = "vars") %>%
  bind_cols(BWQS_risk = "mid")
ci_zcta_low <- as_tibble(confint(fit_drm_risk_3split_zcta_low), rownames = "vars") %>%
  bind_cols(BWQS_risk = "low")

bind_rows(ci_zcta_high, ci_zcta_mid, ci_zcta_low) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  filter(vars == "b") %>%
  right_join(slopes_zcta, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %` * (-d / (4 * e)),
         slope_higher_ci = -`97.5 %` * (-d / (4 * e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 3 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 high      -0.0556        -0.0590         -0.0522
## 2 mid       -0.0605        -0.0644         -0.0565
## 3 low       -0.0664        -0.0702         -0.0626
# Fitted curves with pointwise confidence bands for each ZCTA tertile;
# `handler` intercepts the warnings emitted by drc's predict method.
predict_zcta_ci <- function(fit, risk_label) {
  as_tibble(withCallingHandlers(predict(fit, interval = "confidence"),
                                warning = handler)) %>%
    bind_cols(Risk = risk_label)
}

fit_drm_predictions_zcta <- bind_rows(
  predict_zcta_ci(fit_drm_risk_3split_zcta_high, "High"),
  predict_zcta_ci(fit_drm_risk_3split_zcta_mid, "Mid"),
  predict_zcta_ci(fit_drm_risk_3split_zcta_low, "Low"))

# NOTE(review): bind_cols matches rows purely by position, so this relies on
# arrange(Risk, date) reproducing the row order used when the models were fit.
Subway_BWQS_ZCTA_df1 <- bind_cols(
  Subway_BWQS_ZCTA_df %>% arrange(Risk, date),
  fit_drm_predictions_zcta %>% dplyr::select(-Risk))


# Relabel the tertiles for plotting and drop the early pre-outbreak period.
Subway_BWQS_ZCTA_df2 <- Subway_BWQS_ZCTA_df1 %>%
  filter(date > "2020-02-16") %>% # subsetting for visualization
  mutate(Risk = factor(case_when(Risk == "High" ~ "High (≥ 75%ile)",
                                 Risk == "Mid" ~ "Mid (IQR)",
                                 TRUE ~ "Low (≤ 25%ile)"),
                       levels = c("High (≥ 75%ile)", "Mid (IQR)", "Low (≤ 25%ile)")))

# Supplementary Figure 8: ZCTA-level relative ridership by inequity tertile,
# observed points (jittered) with fitted curves and confidence bands.
# The three per-tertile ribbons are generated as a list of layers; ggplot2
# accepts a list added with `+`, drawn in the same order as before.
sfig8_ribbon_levels <- c("High (≥ 75%ile)", "Mid (IQR)", "Low (≤ 25%ile)")
sfig8 <- ggplot() +
  geom_jitter(data = Subway_BWQS_ZCTA_df2,
              aes(x = date, y = usage.median.ratio, color = Risk),
              size = 0.3, alpha = 0.3,
              position = position_jitter(height = 0, width = 0.42)) +
  lapply(sfig8_ribbon_levels, function(lvl) {
    geom_ribbon(data = subset(Subway_BWQS_ZCTA_df2, Risk == lvl),
                aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50")
  }) +
  geom_line(data = Subway_BWQS_ZCTA_df2,
            aes(x = date, y = Prediction, color = Risk)) +
  scale_x_date("Date", date_minor_breaks = "1 week") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) +
  geom_vline(xintercept = date("2020-03-22"), color = "grey30", lty = 2) + # dashed line marks 2020-03-22 (NY PAUSE)
  theme_bw(base_size = 16) +
  labs(colour = "COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12),
        legend.position = c(0.8, 0.7))
if (export.figs) ggsave(sfig8,
                        filename = file.path(fig.path,
                                             paste0("sfig8", "_", Sys.Date(), ".png")),
                        dpi = 300, width = 8, height = 6)

# Supplementary Figure 9: compare MTA turnstile data to Google mobility reports
source(here("code/mta_vs_google.R"), echo = FALSE)
# The sourced script is expected to define `mobplot`; evaluating it here
# renders the figure.
mobplot

Appendix

#### Appendix #### 
# End-of-analysis timestamp, used by the runtime report below.
t_final = Sys.time()

Runtimes

# Checkpoint timestamps captured throughout the script (recorded output below).
print(t_start)
## [1] "2020-11-24 19:37:44 EST"
print(t_turnstile_1)
## [1] "2020-11-24 19:37:49 EST"
print(t_turnstile_2)
## [1] "2020-11-24 19:52:05 EST"
print(t_census)
## [1] "2020-11-24 19:57:07 EST"
print(t_dataprep)
## [1] "2020-11-24 20:05:43 EST"
print(t_m1)
## [1] "2020-11-24 20:13:43 EST"
print(t_postm1)
## [1] "2020-11-24 20:13:52 EST"
print(t_m2)
## [1] "2020-11-24 20:20:06 EST"
print(t_m3)
## [1] "2020-11-24 20:24:50 EST"
print(t_final)
## [1] "2020-11-24 20:32:08 EST"
# Elapsed times between checkpoints (difftime objects).
# Total runtime:
print(t_final - t_start)
## Time difference of 54.40026 mins
# Time part 1: Script start through loading of MTA Turnstile package:
print(t_turnstile_1 - t_start)
## Time difference of 5.607934 secs
# Time part 2: Downloading (if this is the first run) and processing of subway turnstile data by neighborhood:
print(t_turnstile_2 - t_turnstile_1)
## Time difference of 14.25564 mins
# Time part 3: Downloading of all other data and processing of Census data:
print(t_census - t_turnstile_2)
## Time difference of 5.045368 mins
# Time part 4: All other data processing, modeling, and figure generation:
print(t_final - t_census)
## Time difference of 35.00579 mins

Session Info

# Record the full package/version environment for reproducibility
# (output recorded below).
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       Ubuntu 18.04.4 LTS          
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/New_York            
##  date     2020-11-24                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────
##  package        * version    date       lib source                                 
##  abind          * 1.4-5      2016-07-21 [2] CRAN (R 4.0.2)                         
##  acepack          1.4.1      2016-10-29 [2] CRAN (R 4.0.2)                         
##  askpass          1.1        2019-01-13 [2] CRAN (R 4.0.2)                         
##  assertthat       0.2.1      2019-03-21 [2] CRAN (R 4.0.2)                         
##  backports        1.1.8      2020-06-17 [2] CRAN (R 4.0.2)                         
##  base64enc        0.1-3      2015-07-28 [2] CRAN (R 4.0.2)                         
##  BBmisc           1.11       2017-03-10 [2] CRAN (R 4.0.2)                         
##  bit              1.1-15.2   2020-02-10 [2] CRAN (R 4.0.2)                         
##  bit64            0.9-7      2017-05-08 [2] CRAN (R 4.0.2)                         
##  bitops           1.0-6      2013-08-17 [2] CRAN (R 4.0.2)                         
##  blob             1.2.1      2020-01-20 [2] CRAN (R 4.0.2)                         
##  boot             1.3-25     2020-04-26 [3] CRAN (R 4.0.0)                         
##  broom          * 0.5.6      2020-04-20 [2] CRAN (R 4.0.2)                         
##  callr            3.4.3      2020-03-28 [2] CRAN (R 4.0.2)                         
##  car              3.0-8      2020-05-21 [2] CRAN (R 4.0.2)                         
##  carData          3.0-4      2020-05-22 [2] CRAN (R 4.0.2)                         
##  cellranger       1.1.0      2016-07-27 [2] CRAN (R 4.0.2)                         
##  checkmate        2.0.0      2020-02-06 [2] CRAN (R 4.0.2)                         
##  class            7.3-17     2020-04-26 [3] CRAN (R 4.0.0)                         
##  classInt         0.4-3      2020-04-07 [2] CRAN (R 4.0.2)                         
##  cli              2.0.2      2020-02-28 [2] CRAN (R 4.0.2)                         
##  cluster          2.1.0      2019-06-19 [3] CRAN (R 4.0.0)                         
##  coda             0.19-3     2019-07-05 [2] CRAN (R 4.0.2)                         
##  codetools        0.2-16     2018-12-24 [3] CRAN (R 4.0.0)                         
##  colorspace       1.4-1      2019-03-18 [2] CRAN (R 4.0.2)                         
##  corrplot       * 0.84       2017-10-16 [2] CRAN (R 4.0.2)                         
##  crayon           1.3.4      2017-09-16 [2] CRAN (R 4.0.2)                         
##  curl             4.3        2019-12-02 [2] CRAN (R 4.0.2)                         
##  data.table     * 1.13.0     2020-07-24 [1] CRAN (R 4.0.2)                         
##  DBI              1.1.0      2019-12-15 [2] CRAN (R 4.0.2)                         
##  dbplyr           1.4.4      2020-05-27 [2] CRAN (R 4.0.2)                         
##  deldir           0.1-25     2020-02-03 [2] CRAN (R 4.0.2)                         
##  DHARMa         * 0.3.3.0    2020-09-08 [2] CRAN (R 4.0.2)                         
##  digest           0.6.25     2020-02-23 [2] CRAN (R 4.0.2)                         
##  dplyr          * 1.0.0      2020-05-29 [2] CRAN (R 4.0.2)                         
##  drc            * 3.0-1      2016-08-30 [1] CRAN (R 4.0.2)                         
##  e1071            1.7-3      2019-11-26 [2] CRAN (R 4.0.2)                         
##  egg            * 0.4.5      2019-07-13 [2] CRAN (R 4.0.2)                         
##  ellipsis         0.3.1      2020-05-15 [2] CRAN (R 4.0.2)                         
##  evaluate         0.14       2019-05-28 [2] CRAN (R 4.0.2)                         
##  expm             0.999-4    2019-03-21 [2] CRAN (R 4.0.2)                         
##  fansi            0.4.1      2020-01-08 [2] CRAN (R 4.0.2)                         
##  farver           2.0.3      2020-01-16 [2] CRAN (R 4.0.2)                         
##  fastmap          1.0.1      2019-10-08 [2] CRAN (R 4.0.2)                         
##  fastmatch        1.1-0      2017-01-28 [2] CRAN (R 4.0.2)                         
##  forcats        * 0.5.0      2020-03-01 [2] CRAN (R 4.0.2)                         
##  foreach          1.5.0      2020-03-30 [2] CRAN (R 4.0.2)                         
##  foreign          0.8-79     2020-04-26 [3] CRAN (R 4.0.0)                         
##  Formula          1.2-3      2018-05-03 [2] CRAN (R 4.0.2)                         
##  fs               1.4.2      2020-06-30 [2] CRAN (R 4.0.2)                         
##  fst            * 0.9.2      2020-04-01 [2] CRAN (R 4.0.2)                         
##  gap              1.2.2      2020-02-02 [2] CRAN (R 4.0.2)                         
##  gdata            2.18.0     2017-06-06 [2] CRAN (R 4.0.2)                         
##  generics         0.0.2      2018-11-29 [2] CRAN (R 4.0.2)                         
##  ggExtra        * 0.9        2019-08-27 [2] CRAN (R 4.0.2)                         
##  ggmap            3.0.0      2019-02-05 [2] CRAN (R 4.0.2)                         
##  ggplot2        * 3.3.2      2020-06-19 [2] CRAN (R 4.0.2)                         
##  ggpubr         * 0.4.0      2020-06-27 [1] CRAN (R 4.0.2)                         
##  ggridges       * 0.5.2      2020-01-12 [2] CRAN (R 4.0.2)                         
##  ggsignif         0.6.0      2019-08-08 [1] CRAN (R 4.0.2)                         
##  ggsn           * 0.5.0      2019-02-18 [2] CRAN (R 4.0.2)                         
##  glue             1.4.2      2020-08-27 [1] CRAN (R 4.0.2)                         
##  gmodels          2.18.1     2018-06-25 [2] CRAN (R 4.0.2)                         
##  gridExtra      * 2.3        2017-09-09 [2] CRAN (R 4.0.2)                         
##  gtable           0.3.0      2019-03-25 [2] CRAN (R 4.0.2)                         
##  gtools           3.8.2      2020-03-31 [2] CRAN (R 4.0.2)                         
##  haven            2.3.1      2020-06-01 [2] CRAN (R 4.0.2)                         
##  here           * 0.1        2017-05-28 [2] CRAN (R 4.0.2)                         
##  highr            0.8        2019-03-20 [2] CRAN (R 4.0.2)                         
##  Hmisc            4.4-0      2020-03-23 [2] CRAN (R 4.0.2)                         
##  hms              0.5.3      2020-01-08 [2] CRAN (R 4.0.2)                         
##  htmlTable        2.0.0      2020-06-21 [2] CRAN (R 4.0.2)                         
##  htmltools        0.5.0      2020-06-16 [2] CRAN (R 4.0.2)                         
##  htmlwidgets      1.5.1      2019-10-08 [2] CRAN (R 4.0.2)                         
##  httpuv           1.5.4      2020-06-06 [2] CRAN (R 4.0.2)                         
##  httr           * 1.4.1      2019-08-05 [2] CRAN (R 4.0.2)                         
##  inline           0.3.15     2018-05-18 [2] CRAN (R 4.0.2)                         
##  iterators        1.0.12     2019-07-26 [2] CRAN (R 4.0.2)                         
##  jpeg             0.1-8.1    2019-10-24 [2] CRAN (R 4.0.2)                         
##  jsonlite       * 1.7.0      2020-06-25 [2] CRAN (R 4.0.2)                         
##  Just.universal * 0.0.0.9000 2020-11-24 [1] Github (justlab/Just_universal@4e6147a)
##  kableExtra     * 1.3.1      2020-10-22 [1] CRAN (R 4.0.2)                         
##  KernSmooth       2.23-17    2020-04-26 [3] CRAN (R 4.0.0)                         
##  knitr            1.29       2020-06-23 [2] CRAN (R 4.0.2)                         
##  labeling         0.3        2014-08-23 [2] CRAN (R 4.0.2)                         
##  later            1.1.0.1    2020-06-05 [2] CRAN (R 4.0.2)                         
##  lattice          0.20-41    2020-04-02 [3] CRAN (R 4.0.0)                         
##  latticeExtra     0.6-29     2019-12-19 [2] CRAN (R 4.0.2)                         
##  LearnBayes       2.15.1     2018-03-18 [2] CRAN (R 4.0.2)                         
##  lifecycle        0.2.0      2020-03-06 [2] CRAN (R 4.0.2)                         
##  lme4             1.1-23     2020-04-07 [2] CRAN (R 4.0.2)                         
##  loo              2.2.0      2019-12-19 [2] CRAN (R 4.0.2)                         
##  lubridate      * 1.7.9      2020-06-08 [2] CRAN (R 4.0.2)                         
##  lwgeom         * 0.2-5      2020-06-12 [2] CRAN (R 4.0.2)                         
##  magic          * 1.5-9      2018-09-17 [2] CRAN (R 4.0.2)                         
##  magrittr         1.5        2014-11-22 [2] CRAN (R 4.0.2)                         
##  maptools         1.0-1      2020-05-14 [2] CRAN (R 4.0.2)                         
##  MASS           * 7.3-51.6   2020-04-26 [3] CRAN (R 4.0.0)                         
##  Matrix         * 1.2-18     2019-11-27 [3] CRAN (R 4.0.0)                         
##  matrixStats    * 0.56.0     2020-03-13 [2] CRAN (R 4.0.2)                         
##  memoise          1.1.0      2017-04-21 [2] CRAN (R 4.0.2)                         
##  mgcv             1.8-31     2019-11-09 [3] CRAN (R 4.0.0)                         
##  mime             0.9        2020-02-04 [2] CRAN (R 4.0.2)                         
##  miniUI           0.1.1.1    2018-05-18 [2] CRAN (R 4.0.2)                         
##  minqa            1.2.4      2014-10-09 [2] CRAN (R 4.0.2)                         
##  modelr           0.1.8      2020-05-19 [2] CRAN (R 4.0.2)                         
##  MTA.turnstile  * 0.0.0.9001 2020-11-24 [1] Github (justlab/MTA_turnstile@544eba3) 
##  multcomp         1.4-13     2020-04-08 [2] CRAN (R 4.0.2)                         
##  munsell          0.5.0      2018-06-12 [2] CRAN (R 4.0.2)                         
##  mvtnorm          1.1-1      2020-06-09 [2] CRAN (R 4.0.2)                         
##  nlme             3.1-147    2020-04-13 [3] CRAN (R 4.0.0)                         
##  nloptr           1.2.2.1    2020-03-11 [2] CRAN (R 4.0.2)                         
##  nnet             7.3-14     2020-04-26 [3] CRAN (R 4.0.0)                         
##  openxlsx         4.1.5      2020-05-06 [2] CRAN (R 4.0.2)                         
##  ParamHelpers     1.14       2020-03-24 [2] CRAN (R 4.0.2)                         
##  pbapply          1.4-3      2020-08-18 [1] CRAN (R 4.0.2)                         
##  pdftools       * 2.3.1      2020-05-22 [1] CRAN (R 4.0.2)                         
##  pillar           1.4.4      2020-05-05 [2] CRAN (R 4.0.2)                         
##  pkgbuild         1.0.8      2020-05-07 [2] CRAN (R 4.0.2)                         
##  pkgconfig        2.0.3      2019-09-22 [2] CRAN (R 4.0.2)                         
##  plotrix          3.7-8      2020-04-16 [2] CRAN (R 4.0.2)                         
##  plyr             1.8.6      2020-03-03 [2] CRAN (R 4.0.2)                         
##  png              0.1-7      2013-12-03 [2] CRAN (R 4.0.2)                         
##  prettyunits      1.1.1      2020-01-24 [2] CRAN (R 4.0.2)                         
##  processx         3.4.2      2020-02-09 [2] CRAN (R 4.0.2)                         
##  promises         1.1.1      2020-06-09 [2] CRAN (R 4.0.2)                         
##  ps               1.3.3      2020-05-08 [2] CRAN (R 4.0.2)                         
##  purrr          * 0.3.4      2020-04-17 [2] CRAN (R 4.0.2)                         
##  qpdf             1.1        2019-03-07 [1] CRAN (R 4.0.2)                         
##  qs             * 0.22.1     2020-06-10 [2] CRAN (R 4.0.2)                         
##  R6               2.4.1      2019-11-12 [2] CRAN (R 4.0.2)                         
##  ragg           * 0.4.0      2020-10-05 [1] CRAN (R 4.0.2)                         
##  RANN             2.6.1      2019-01-08 [2] CRAN (R 4.0.2)                         
##  RApiSerialize    0.1.0      2014-04-19 [2] CRAN (R 4.0.2)                         
##  rappdirs         0.3.1      2016-03-28 [2] CRAN (R 4.0.2)                         
##  raster           3.3-7      2020-06-27 [2] CRAN (R 4.0.2)                         
##  RColorBrewer     1.1-2      2014-12-07 [2] CRAN (R 4.0.2)                         
##  Rcpp             1.0.4.6    2020-04-09 [2] CRAN (R 4.0.2)                         
##  RcppParallel     5.0.2      2020-06-24 [2] CRAN (R 4.0.2)                         
##  readr          * 1.3.1      2018-12-21 [2] CRAN (R 4.0.2)                         
##  readxl         * 1.3.1      2019-03-13 [2] CRAN (R 4.0.2)                         
##  reprex           0.3.0      2019-05-16 [2] CRAN (R 4.0.2)                         
##  rgdal            1.5-12     2020-06-26 [2] CRAN (R 4.0.2)                         
##  RgoogleMaps      1.4.5.3    2020-02-12 [2] CRAN (R 4.0.2)                         
##  rio              0.5.16     2018-11-26 [2] CRAN (R 4.0.2)                         
##  rjson            0.2.20     2018-06-08 [2] CRAN (R 4.0.2)                         
##  rlang            0.4.6      2020-05-02 [2] CRAN (R 4.0.2)                         
##  rmarkdown        2.3        2020-06-18 [2] CRAN (R 4.0.2)                         
##  rpart            4.1-15     2019-04-12 [3] CRAN (R 4.0.0)                         
##  rprojroot        1.3-2      2018-01-03 [2] CRAN (R 4.0.2)                         
##  RSQLite          2.2.1      2020-09-30 [2] CRAN (R 4.0.2)                         
##  rstan          * 2.19.3     2020-02-11 [2] CRAN (R 4.0.2)                         
##  rstatix          0.6.0      2020-06-18 [1] CRAN (R 4.0.2)                         
##  rstudioapi       0.11       2020-02-07 [2] CRAN (R 4.0.2)                         
##  rvest            0.3.5      2019-11-08 [2] CRAN (R 4.0.2)                         
##  sandwich         2.5-1      2019-04-06 [2] CRAN (R 4.0.2)                         
##  scales         * 1.1.1      2020-05-11 [2] CRAN (R 4.0.2)                         
##  sessioninfo      1.1.1      2018-11-05 [2] CRAN (R 4.0.2)                         
##  sf             * 0.9-6      2020-09-13 [2] CRAN (R 4.0.2)                         
##  shiny            1.5.0      2020-06-23 [2] CRAN (R 4.0.2)                         
##  sp             * 1.4-2      2020-05-20 [2] CRAN (R 4.0.2)                         
##  spatialreg     * 1.1-5      2019-12-01 [1] CRAN (R 4.0.2)                         
##  spData         * 0.3.5      2020-04-06 [2] CRAN (R 4.0.2)                         
##  spdep          * 1.1-5      2020-06-29 [2] CRAN (R 4.0.2)                         
##  StanHeaders    * 2.21.0-6   2020-08-16 [1] CRAN (R 4.0.2)                         
##  statmod          1.4.34     2020-02-17 [2] CRAN (R 4.0.2)                         
##  stringfish       0.12.1     2020-06-05 [2] CRAN (R 4.0.2)                         
##  stringi          1.5.3      2020-09-09 [1] CRAN (R 4.0.2)                         
##  stringr        * 1.4.0      2019-02-10 [2] CRAN (R 4.0.2)                         
##  survival         3.1-12     2020-04-10 [3] CRAN (R 4.0.0)                         
##  systemfonts      0.3.2      2020-09-29 [1] CRAN (R 4.0.2)                         
##  textshaping      0.1.2      2020-10-08 [1] CRAN (R 4.0.2)                         
##  TH.data          1.0-10     2019-01-21 [2] CRAN (R 4.0.2)                         
##  tibble         * 3.0.1      2020-04-20 [2] CRAN (R 4.0.2)                         
##  tidycensus     * 0.10.2     2020-09-28 [1] CRAN (R 4.0.2)                         
##  tidyr          * 1.1.0      2020-05-20 [2] CRAN (R 4.0.2)                         
##  tidyselect       1.1.0      2020-05-11 [2] CRAN (R 4.0.2)                         
##  tidyverse      * 1.3.0      2019-11-21 [2] CRAN (R 4.0.2)                         
##  tigris           0.9.4      2020-04-01 [2] CRAN (R 4.0.2)                         
##  units            0.6-7      2020-06-13 [2] CRAN (R 4.0.2)                         
##  utf8             1.1.4      2018-05-24 [2] CRAN (R 4.0.2)                         
##  uuid             0.1-4      2020-02-26 [2] CRAN (R 4.0.2)                         
##  vctrs            0.3.1      2020-06-05 [2] CRAN (R 4.0.2)                         
##  viridisLite      0.3.0      2018-02-01 [2] CRAN (R 4.0.2)                         
##  webshot          0.5.2      2019-11-22 [2] CRAN (R 4.0.2)                         
##  withr            2.2.0      2020-04-20 [2] CRAN (R 4.0.2)                         
##  xfun             0.15       2020-06-21 [2] CRAN (R 4.0.2)                         
##  xgboost          1.1.1.1    2020-06-14 [2] CRAN (R 4.0.2)                         
##  xml2             1.3.2      2020-04-23 [2] CRAN (R 4.0.2)                         
##  xtable           1.8-4      2019-04-21 [2] CRAN (R 4.0.2)                         
##  yaml             2.2.1      2020-02-01 [2] CRAN (R 4.0.2)                         
##  zip            * 2.0.4      2019-09-01 [2] CRAN (R 4.0.2)                         
##  zoo              1.8-8      2020-05-02 [2] CRAN (R 4.0.2)                         
## 
## [1] /home/rushj03/R/x86_64-pc-linux-gnu-library/4.0
## [2] /usr/lib/R/site-library
## [3] /usr/lib/R/library